Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Telemetry Metrics Configuration. #18186

Merged
merged 22 commits into from
Feb 10, 2023
Merged
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
805c6ad
Telemetry Metrics Configuration.
kitography Dec 1, 2022
50b4809
Err Shadowing Fix (woah, semgrep is cool).
kitography Dec 1, 2022
0f5fe89
Fix TestBackend_RevokePlusTidy_Intermediate
kitography Dec 1, 2022
644291b
Add Changelog.
kitography Dec 1, 2022
e124d17
Fix memory leak. Code cleanup as suggested by Steve.
kitography Dec 5, 2022
173143b
Turn off metrics by default, breaking-change.
kitography Dec 5, 2022
39e7113
Merge branch 'main' into VAULT-8784-telemetry-turn-off-on-metrics-count
kitography Dec 12, 2022
50e02af
Show on tidy-status before start-up.
kitography Dec 13, 2022
38421eb
Fix tests
kitography Dec 13, 2022
617f8e3
make fmt
kitography Dec 13, 2022
dd21615
Merge branch 'main' into VAULT-8784-telemetry-turn-off-on-metrics-count
kitography Feb 10, 2023
27e5ca8
Add emit metrics to periodicFunc
kitography Feb 10, 2023
f4473ba
Test not delivering unavailable metrics + fix.
kitography Feb 10, 2023
bff0245
Better error message.
kitography Feb 10, 2023
eb646af
Fixing the false-error bug.
kitography Feb 10, 2023
945cf93
make fmt.
kitography Feb 10, 2023
32fb8f1
Try to fix race issue, remove confusing comments.
kitography Feb 10, 2023
f2a546d
Switch metric counter variables to an atomic.Uint32
stevendpclark Feb 10, 2023
82ca965
Fix race-issue better by trying until the metric is sunk.
kitography Feb 10, 2023
3cb6793
make fmt.
kitography Feb 10, 2023
ffc680a
empty commit to retrigger non-race tests that all pass locally
kitography Feb 10, 2023
eeeb864
Extend metrics, remove parallel run because I have no idea what else …
kitography Feb 10, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 117 additions & 42 deletions builtin/logical/pki/backend.go
Original file line number Diff line number Diff line change
@@ -9,12 +9,12 @@ import (
"sync/atomic"
"time"

atomic2 "go.uber.org/atomic"

"github.com/hashicorp/vault/helper/constants"

"github.com/hashicorp/go-multierror"

atomic2 "go.uber.org/atomic"

"github.com/hashicorp/vault/sdk/helper/consts"

"github.com/armon/go-metrics"
@@ -244,9 +244,12 @@ func Backend(conf *logical.BackendConfig) *backend {
b.lastTidy = time.Now()

// Metrics initialization for count of certificates in storage
b.certCountEnabled = atomic2.NewBool(false)
b.publishCertCountMetrics = atomic2.NewBool(false)
b.certsCounted = atomic2.NewBool(false)
b.certCount = new(uint32)
b.revokedCertCount = new(uint32)
b.certCountError = "Initialize Not Yet Run, Cert Counts Unavailable"
b.certCount = &atomic.Uint32{}
b.revokedCertCount = &atomic.Uint32{}
b.possibleDoubleCountedSerials = make([]string, 0, 250)
b.possibleDoubleCountedRevokedSerials = make([]string, 0, 250)

@@ -270,9 +273,12 @@ type backend struct {

unifiedTransferStatus *unifiedTransferStatus

certCount *uint32
revokedCertCount *uint32
certCountEnabled *atomic2.Bool
publishCertCountMetrics *atomic2.Bool
certCount *atomic.Uint32
revokedCertCount *atomic.Uint32
certsCounted *atomic2.Bool
certCountError string
possibleDoubleCountedSerials []string
possibleDoubleCountedRevokedSerials []string

@@ -364,9 +370,10 @@ func (b *backend) initialize(ctx context.Context, _ *logical.InitializationReque
// Initialize also needs to populate our certificate and revoked certificate count
err = b.initializeStoredCertificateCounts(ctx)
if err != nil {
return err
// Don't block/err initialize/startup for metrics. Context on this call can time out due to number of certificates.
b.Logger().Error("Could not initialize stored certificate counts", err)
b.certCountError = err.Error()
}

return nil
}

@@ -601,6 +608,13 @@ func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) er
crlErr := doCRL()
tidyErr := doAutoTidy()

// Periodically re-emit gauges so that they don't disappear/go stale
tidyConfig, err := sc.getAutoTidyConfig()
if err != nil {
return err
}
b.emitCertStoreMetrics(tidyConfig)

var errors error
if crlErr != nil {
errors = multierror.Append(errors, fmt.Errorf("Error building CRLs:\n - %w\n", crlErr))
@@ -625,24 +639,51 @@ func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) er
}

func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error {
b.tidyStatusLock.RLock()
defer b.tidyStatusLock.RUnlock()
// For performance reasons, we can't lock on issuance/storage of certs until a list operation completes,
// but we want to limit possible miscounts / double-counts to over-counting, so we take the tidy lock which
// prevents (most) deletions - in particular we take a read lock (sufficient to block the write lock in
// tidyStatusStart while allowing tidy to still acquire a read lock to report via its endpoint)
b.tidyStatusLock.RLock()
defer b.tidyStatusLock.RUnlock()
sc := b.makeStorageContext(ctx, b.storage)
config, err := sc.getAutoTidyConfig()
if err != nil {
return err
}

b.certCountEnabled.Store(config.MaintainCount)
b.publishCertCountMetrics.Store(config.PublishMetrics)

if config.MaintainCount == false {
b.possibleDoubleCountedRevokedSerials = nil
b.possibleDoubleCountedSerials = nil
b.certsCounted.Store(true)
b.certCount.Store(0)
b.revokedCertCount.Store(0)
b.certCountError = "Cert Count is Disabled: enable via Tidy Config maintain_stored_certificate_counts"
return nil
}

// Ideally these three things would be set in one transaction, since that isn't possible, set the counts to "0",
// first, so count will over-count (and miss putting things in deduplicate queue), rather than under-count.
b.certCount.Store(0)
b.revokedCertCount.Store(0)
b.possibleDoubleCountedRevokedSerials = nil
b.possibleDoubleCountedSerials = nil
// A cert issued or revoked here will be double-counted. That's okay, this is "best effort" metrics.
b.certsCounted.Store(false)

entries, err := b.storage.List(ctx, "certs/")
if err != nil {
return err
}
atomic.AddUint32(b.certCount, uint32(len(entries)))
b.certCount.Add(uint32(len(entries)))

revokedEntries, err := b.storage.List(ctx, "revoked/")
if err != nil {
return err
}
atomic.AddUint32(b.revokedCertCount, uint32(len(revokedEntries)))
b.revokedCertCount.Add(uint32(len(revokedEntries)))

b.certsCounted.Store(true)
// Now that the metrics are set, we can switch from appending newly-stored certificates to the possible double-count
@@ -723,64 +764,98 @@ func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error {
b.possibleDoubleCountedRevokedSerials = nil
b.possibleDoubleCountedSerials = nil

certCount := atomic.LoadUint32(b.certCount)
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
revokedCertCount := atomic.LoadUint32(b.revokedCertCount)
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount))
b.emitCertStoreMetrics(config)

b.certCountError = ""

return nil
}

func (b *backend) emitCertStoreMetrics(config *tidyConfig) {
if config.PublishMetrics == true {
certCount := b.certCount.Load()
b.emitTotalCertCountMetric(certCount)
revokedCertCount := b.revokedCertCount.Load()
b.emitTotalRevokedCountMetric(revokedCertCount)
}
}

// The "certsCounted" boolean here should be loaded from the backend certsCounted before the corresponding storage call:
// eg. certsCounted := b.certsCounted.Load()
func (b *backend) incrementTotalCertificatesCount(certsCounted bool, newSerial string) {
certCount := atomic.AddUint32(b.certCount, 1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "certs/") {
newSerial = newSerial[6:]
func (b *backend) ifCountEnabledIncrementTotalCertificatesCount(certsCounted bool, newSerial string) {
if b.certCountEnabled.Load() {
certCount := b.certCount.Add(1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "certs/") {
newSerial = newSerial[6:]
}
b.possibleDoubleCountedSerials = append(b.possibleDoubleCountedSerials, newSerial)
default:
if b.publishCertCountMetrics.Load() {
b.emitTotalCertCountMetric(certCount)
}
}
b.possibleDoubleCountedSerials = append(b.possibleDoubleCountedSerials, newSerial)
default:
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
}
}

func (b *backend) decrementTotalCertificatesCountReport() {
certCount := b.decrementTotalCertificatesCountNoReport()
func (b *backend) ifCountEnabledDecrementTotalCertificatesCountReport() {
if b.certCountEnabled.Load() {
certCount := b.decrementTotalCertificatesCountNoReport()
if b.publishCertCountMetrics.Load() {
b.emitTotalCertCountMetric(certCount)
}
}
}

func (b *backend) emitTotalCertCountMetric(certCount uint32) {
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
}

// Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet
// Does not respect whether-we-are-counting backend information.
func (b *backend) decrementTotalCertificatesCountNoReport() uint32 {
newCount := atomic.AddUint32(b.certCount, ^uint32(0))
newCount := b.certCount.Add(^uint32(0))
return newCount
}

// The "certsCounted" boolean here should be loaded from the backend certsCounted before the corresponding storage call:
// eg. certsCounted := b.certsCounted.Load()
func (b *backend) incrementTotalRevokedCertificatesCount(certsCounted bool, newSerial string) {
newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, 1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "revoked/") { // allow passing in the path (revoked/serial) OR the serial
newSerial = newSerial[8:]
func (b *backend) ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted bool, newSerial string) {
if b.certCountEnabled.Load() {
newRevokedCertCount := b.revokedCertCount.Add(1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "revoked/") { // allow passing in the path (revoked/serial) OR the serial
newSerial = newSerial[8:]
}
b.possibleDoubleCountedRevokedSerials = append(b.possibleDoubleCountedRevokedSerials, newSerial)
default:
if b.publishCertCountMetrics.Load() {
b.emitTotalRevokedCountMetric(newRevokedCertCount)
}
}
}
}

func (b *backend) ifCountEnabledDecrementTotalRevokedCertificatesCountReport() {
if b.certCountEnabled.Load() {
revokedCertCount := b.decrementTotalRevokedCertificatesCountNoReport()
if b.publishCertCountMetrics.Load() {
b.emitTotalRevokedCountMetric(revokedCertCount)
}
b.possibleDoubleCountedRevokedSerials = append(b.possibleDoubleCountedRevokedSerials, newSerial)
default:
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(newRevokedCertCount))
}
}

func (b *backend) decrementTotalRevokedCertificatesCountReport() {
revokedCertCount := b.decrementTotalRevokedCertificatesCountNoReport()
func (b *backend) emitTotalRevokedCountMetric(revokedCertCount uint32) {
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount))
}

// Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet
// Does not respect whether-we-are-counting backend information.
func (b *backend) decrementTotalRevokedCertificatesCountNoReport() uint32 {
newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, ^uint32(0))
newRevokedCertCount := b.revokedCertCount.Add(^uint32(0))
return newRevokedCertCount
}
49 changes: 36 additions & 13 deletions builtin/logical/pki/backend_test.go
Original file line number Diff line number Diff line change
@@ -27,7 +27,6 @@ import (
"strconv"
"strings"
"sync"
"sync/atomic"
"testing"
"time"

@@ -3797,6 +3796,15 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
t.Fatal(err)
}

// Set up Metric Configuration, then restart to enable it
_, err = client.Logical().Write("pki/config/auto-tidy", map[string]interface{}{
"maintain_stored_certificate_counts": true,
"publish_stored_certificate_count_metrics": true,
})
_, err = client.Logical().Write("/sys/plugins/reload/backend", map[string]interface{}{
"mounts": "pki/",
})

// Check the metrics initialized in order to calculate backendUUID for /pki
// BackendUUID not consistent during tests with UUID from /sys/mounts/pki
metricsSuffix := "total_certificates_stored"
@@ -3833,6 +3841,14 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
if err != nil {
t.Fatal(err)
}
// Set up Metric Configuration, then restart to enable it
_, err = client.Logical().Write("pki2/config/auto-tidy", map[string]interface{}{
"maintain_stored_certificate_counts": true,
"publish_stored_certificate_count_metrics": true,
})
_, err = client.Logical().Write("/sys/plugins/reload/backend", map[string]interface{}{
"mounts": "pki2/",
})

// Create a CSR for the intermediate CA
secret, err := client.Logical().Write("pki2/intermediate/generate/internal", nil)
@@ -3961,6 +3977,7 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
"current_revoked_cert_count": json.Number("0"),
"revocation_queue_deleted_count": json.Number("0"),
"cross_revoked_cert_deleted_count": json.Number("0"),
"internal_backend_uuid": backendUUID,
}
// Let's copy the times from the response so that we can use deep.Equal()
timeStarted, ok := tidyStatus.Data["time_started"]
@@ -5590,6 +5607,14 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) {
serials[i] = resp.Data["serial_number"].(string)
}

// Turn on certificate counting:
CBWrite(b, s, "config/auto-tidy", map[string]interface{}{
"maintain_stored_certificate_counts": true,
"publish_stored_certificate_count_metrics": false,
})
// Assert initialize from clean is correct:
b.initializeStoredCertificateCounts(ctx)

// Revoke certificates A + B
revocations := serials[0:2]
for _, key := range revocations {
@@ -5601,18 +5626,16 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) {
}
}

// Assert initialize from clean is correct:
b.initializeStoredCertificateCounts(ctx)
if atomic.LoadUint32(b.certCount) != 6 {
t.Fatalf("Failed to count six certificates root,A,B,C,D,E, instead counted %d certs", atomic.LoadUint32(b.certCount))
if b.certCount.Load() != 6 {
t.Fatalf("Failed to count six certificates root,A,B,C,D,E, instead counted %d certs", b.certCount.Load())
}
if atomic.LoadUint32(b.revokedCertCount) != 2 {
t.Fatalf("Failed to count two revoked certificates A+B, instead counted %d certs", atomic.LoadUint32(b.revokedCertCount))
if b.revokedCertCount.Load() != 2 {
t.Fatalf("Failed to count two revoked certificates A+B, instead counted %d certs", b.revokedCertCount.Load())
}

// Simulates listing while initialize in progress, by "restarting it"
atomic.StoreUint32(b.certCount, 0)
atomic.StoreUint32(b.revokedCertCount, 0)
b.certCount.Store(0)
b.revokedCertCount.Store(0)
b.certsCounted.Store(false)

// Revoke certificates C, D
@@ -5641,12 +5664,12 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) {
b.initializeStoredCertificateCounts(ctx)

// Test certificate count
if atomic.LoadUint32(b.certCount) != 8 {
t.Fatalf("Failed to initialize count of certificates root, A,B,C,D,E,F,G counted %d certs", *(b.certCount))
if b.certCount.Load() != 8 {
t.Fatalf("Failed to initialize count of certificates root, A,B,C,D,E,F,G counted %d certs", b.certCount.Load())
}

if atomic.LoadUint32(b.revokedCertCount) != 4 {
t.Fatalf("Failed to count revoked certificates A,B,C,D counted %d certs", *(b.revokedCertCount))
if b.revokedCertCount.Load() != 4 {
t.Fatalf("Failed to count revoked certificates A,B,C,D counted %d certs", b.revokedCertCount.Load())
}

return
4 changes: 2 additions & 2 deletions builtin/logical/pki/cert_util.go
Original file line number Diff line number Diff line change
@@ -234,9 +234,9 @@ func fetchCertBySerial(sc *storageContext, prefix, serial string) (*logical.Stor
// If we fail here, we have an extra (copy) of a cert in storage, add to metrics:
switch {
case strings.HasPrefix(prefix, "revoked/"):
sc.Backend.incrementTotalRevokedCertificatesCount(certsCounted, path)
sc.Backend.ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted, path)
default:
sc.Backend.incrementTotalCertificatesCount(certsCounted, path)
sc.Backend.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, path)
}
return nil, errutil.InternalError{Err: fmt.Sprintf("error deleting certificate with serial %s from old location", serial)}
}
2 changes: 1 addition & 1 deletion builtin/logical/pki/crl_util.go
Original file line number Diff line number Diff line change
@@ -978,7 +978,7 @@ func revokeCert(sc *storageContext, config *crlConfig, cert *x509.Certificate) (
if err != nil {
return nil, fmt.Errorf("error saving revoked certificate to new location")
}
sc.Backend.incrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key)
sc.Backend.ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key)

// If this flag is enabled after the fact, existing local entries will be published to
// the unified storage space through a periodic function.
17 changes: 17 additions & 0 deletions builtin/logical/pki/fields.go
Original file line number Diff line number Diff line change
@@ -518,6 +518,23 @@ greater period of time. By default this is zero seconds.`,
Default: "0s",
}

fields["maintain_stored_certificate_counts"] = &framework.FieldSchema{
Type: framework.TypeBool,
Description: `This configures whether stored certificates
are counted upon initialization of the backend, and whether during
normal operation, a running count of certificates stored is maintained.`,
Default: false,
}

fields["publish_stored_certificate_count_metrics"] = &framework.FieldSchema{
Type: framework.TypeBool,
Description: `This configures whether the stored certificate
count is published to the metrics consumer. It does not affect if the
stored certificate count is maintained, and if maintained, it will be
available on the tidy-status endpoint.`,
Default: false,
}

fields["tidy_revocation_queue"] = &framework.FieldSchema{
Type: framework.TypeBool,
Description: `Set to true to remove stale revocation queue entries
2 changes: 1 addition & 1 deletion builtin/logical/pki/path_issue_sign.go
Original file line number Diff line number Diff line change
@@ -419,7 +419,7 @@ func (b *backend) pathIssueSignCert(ctx context.Context, req *logical.Request, d
if err != nil {
return nil, fmt.Errorf("unable to store certificate locally: %w", err)
}
b.incrementTotalCertificatesCount(certsCounted, key)
b.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, key)
}

if useCSR {
4 changes: 2 additions & 2 deletions builtin/logical/pki/path_root.go
Original file line number Diff line number Diff line change
@@ -271,7 +271,7 @@ func (b *backend) pathCAGenerateRoot(ctx context.Context, req *logical.Request,
if err != nil {
return nil, fmt.Errorf("unable to store certificate locally: %w", err)
}
b.incrementTotalCertificatesCount(certsCounted, key)
b.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, key)

// Build a fresh CRL
err = b.crlBuilder.rebuild(sc, true)
@@ -468,7 +468,7 @@ func (b *backend) pathIssuerSignIntermediate(ctx context.Context, req *logical.R
if err != nil {
return nil, fmt.Errorf("unable to store certificate locally: %w", err)
}
b.incrementTotalCertificatesCount(certsCounted, key)
b.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, key)

if parsedBundle.Certificate.MaxPathLen == 0 {
resp.AddWarning("Max path length of the signed certificate is zero. This certificate cannot be used to issue intermediate CA certificates.")
70 changes: 47 additions & 23 deletions builtin/logical/pki/path_tidy.go
Original file line number Diff line number Diff line change
@@ -70,6 +70,8 @@ type tidyConfig struct {
SafetyBuffer time.Duration `json:"safety_buffer"`
IssuerSafetyBuffer time.Duration `json:"issuer_safety_buffer"`
PauseDuration time.Duration `json:"pause_duration"`
MaintainCount bool `json:"maintain_stored_certificate_counts"`
PublishMetrics bool `json:"publish_stored_certificate_count_metrics"`
RevocationQueue bool `json:"tidy_revocation_queue"`
QueueSafetyBuffer time.Duration `json:"revocation_queue_safety_buffer"`
CrossRevokedCerts bool `json:"tidy_cross_cluster_revoked_certs"`
@@ -86,6 +88,8 @@ var defaultTidyConfig = tidyConfig{
SafetyBuffer: 72 * time.Hour,
IssuerSafetyBuffer: 365 * 24 * time.Hour,
PauseDuration: 0 * time.Second,
MaintainCount: false,
PublishMetrics: false,
RevocationQueue: false,
QueueSafetyBuffer: 48 * time.Hour,
CrossRevokedCerts: false,
@@ -973,11 +977,26 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f
"missing_issuer_cert_count": nil,
"current_cert_store_count": nil,
"current_revoked_cert_count": nil,
"internal_backend_uuid": nil,
"revocation_queue_deleted_count": nil,
"cross_revoked_cert_deleted_count": nil,
},
}

resp.Data["internal_backend_uuid"] = b.backendUUID

if b.certCountEnabled.Load() {
resp.Data["current_cert_store_count"] = b.certCount.Load()
resp.Data["current_revoked_cert_count"] = b.revokedCertCount.Load()
if !b.certsCounted.Load() {
resp.AddWarning("Certificates in storage are still being counted, current counts provided may be " +
"inaccurate")
}
if b.certCountError != "" {
resp.Data["certificate_counting_error"] = b.certCountError
}
}

if b.tidyStatus.state == tidyStatusInactive {
return resp, nil
}
@@ -1020,14 +1039,6 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f
resp.Data["time_finished"] = b.tidyStatus.timeFinished
}

resp.Data["current_cert_store_count"] = atomic.LoadUint32(b.certCount)
resp.Data["current_revoked_cert_count"] = atomic.LoadUint32(b.revokedCertCount)

if !b.certsCounted.Load() {
resp.AddWarning("Certificates in storage are still being counted, current counts provided may be " +
"inaccurate")
}

return resp, nil
}

@@ -1040,19 +1051,21 @@ func (b *backend) pathConfigAutoTidyRead(ctx context.Context, req *logical.Reque

return &logical.Response{
Data: map[string]interface{}{
"enabled": config.Enabled,
"interval_duration": int(config.Interval / time.Second),
"tidy_cert_store": config.CertStore,
"tidy_revoked_certs": config.RevokedCerts,
"tidy_revoked_cert_issuer_associations": config.IssuerAssocs,
"tidy_expired_issuers": config.ExpiredIssuers,
"tidy_move_legacy_ca_bundle": config.BackupBundle,
"safety_buffer": int(config.SafetyBuffer / time.Second),
"issuer_safety_buffer": int(config.IssuerSafetyBuffer / time.Second),
"pause_duration": config.PauseDuration.String(),
"tidy_revocation_queue": config.RevocationQueue,
"revocation_queue_safety_buffer": int(config.QueueSafetyBuffer / time.Second),
"tidy_cross_cluster_revoked_certs": config.CrossRevokedCerts,
"enabled": config.Enabled,
"interval_duration": int(config.Interval / time.Second),
"tidy_cert_store": config.CertStore,
"tidy_revoked_certs": config.RevokedCerts,
"tidy_revoked_cert_issuer_associations": config.IssuerAssocs,
"tidy_expired_issuers": config.ExpiredIssuers,
"safety_buffer": int(config.SafetyBuffer / time.Second),
"issuer_safety_buffer": int(config.IssuerSafetyBuffer / time.Second),
"pause_duration": config.PauseDuration.String(),
"publish_stored_certificate_count_metrics": config.PublishMetrics,
"maintain_stored_certificate_counts": config.MaintainCount,
"tidy_move_legacy_ca_bundle": config.BackupBundle,
"tidy_revocation_queue": config.RevocationQueue,
"revocation_queue_safety_buffer": int(config.QueueSafetyBuffer / time.Second),
"tidy_cross_cluster_revoked_certs": config.CrossRevokedCerts,
},
}, nil
}
@@ -1139,6 +1152,17 @@ func (b *backend) pathConfigAutoTidyWrite(ctx context.Context, req *logical.Requ
return logical.ErrorResponse("Auto-tidy enabled but no tidy operations were requested. Enable at least one tidy operation to be run (tidy_cert_store / tidy_revoked_certs / tidy_revoked_cert_issuer_associations / tidy_expired_issuers / tidy_move_legacy_ca_bundle / tidy_revocation_queue / tidy_cross_cluster_revoked_certs)."), nil
}

if maintainCountEnabledRaw, ok := d.GetOk("maintain_stored_certificate_counts"); ok {
config.MaintainCount = maintainCountEnabledRaw.(bool)
}

if runningStorageMetricsEnabledRaw, ok := d.GetOk("publish_stored_certificate_count_metrics"); ok {
if config.MaintainCount == false {
return logical.ErrorResponse("Can not publish a running storage metrics count to metrics without first maintaining that count. Enable `maintain_stored_certificate_counts` to enable `publish_stored_certificate_count_metrics."), nil
}
config.PublishMetrics = runningStorageMetricsEnabledRaw.(bool)
}

if err := sc.writeAutoTidyConfig(config); err != nil {
return nil, err
}
@@ -1224,7 +1248,7 @@ func (b *backend) tidyStatusIncCertStoreCount() {

b.tidyStatus.certStoreDeletedCount++

b.decrementTotalCertificatesCountReport()
b.ifCountEnabledDecrementTotalCertificatesCountReport()
}

func (b *backend) tidyStatusIncRevokedCertCount() {
@@ -1233,7 +1257,7 @@ func (b *backend) tidyStatusIncRevokedCertCount() {

b.tidyStatus.revokedCertDeletedCount++

b.decrementTotalRevokedCertificatesCountReport()
b.ifCountEnabledDecrementTotalRevokedCertificatesCountReport()
}

func (b *backend) tidyStatusIncMissingIssuerCertCount() {
341 changes: 341 additions & 0 deletions builtin/logical/pki/path_tidy_test.go
Original file line number Diff line number Diff line change
@@ -2,9 +2,15 @@ package pki

import (
"encoding/json"
"errors"
"fmt"
"testing"
"time"

"github.com/hashicorp/vault/helper/testhelpers"

"github.com/armon/go-metrics"

"github.com/hashicorp/vault/api"
vaulthttp "github.com/hashicorp/vault/http"
"github.com/hashicorp/vault/sdk/logical"
@@ -405,3 +411,338 @@ func TestTidyIssuerConfig(t *testing.T) {
require.Equal(t, true, resp.Data["tidy_expired_issuers"])
require.Equal(t, 5, resp.Data["issuer_safety_buffer"])
}

// TestCertStorageMetrics ensures that when enabled, metrics are able to count the number of certificates in storage and
// number of revoked certificates in storage. Moreover, this test ensures that the gauge is emitted periodically, so
// that the metric does not disappear or go stale.
func TestCertStorageMetrics(t *testing.T) {
// This tests uses the same setup as TestAutoTidy
newPeriod := 1 * time.Second

// We set up a metrics accumulator
inmemSink := metrics.NewInmemSink(
2*newPeriod, // A short time period is ideal here to test metrics are emitted every periodic func
2000000*time.Hour)

metricsConf := metrics.DefaultConfig("")
metricsConf.EnableHostname = false
metricsConf.EnableHostnameLabel = false
metricsConf.EnableServiceLabel = false
metricsConf.EnableTypePrefix = false

_, err := metrics.NewGlobal(metricsConf, inmemSink)
if err != nil {
t.Fatal(err)
}

// This test requires the periodicFunc to trigger, which requires we stand
// up a full test cluster.
coreConfig := &vault.CoreConfig{
LogicalBackends: map[string]logical.Factory{
"pki": Factory,
},
// See notes below about usage of /sys/raw for reading cluster
// storage without barrier encryption.
EnableRaw: true,
RollbackPeriod: newPeriod,
}
cluster := vault.NewTestCluster(t, coreConfig, &vault.TestClusterOptions{
HandlerFunc: vaulthttp.Handler,
})
cluster.Start()
defer cluster.Cleanup()
client := cluster.Cores[0].Client

// Mount PKI
err = client.Sys().Mount("pki", &api.MountInput{
Type: "pki",
Config: api.MountConfigInput{
DefaultLeaseTTL: "10m",
MaxLeaseTTL: "60m",
},
})
require.NoError(t, err)

// Generate root.
resp, err := client.Logical().Write("pki/root/generate/internal", map[string]interface{}{
"ttl": "40h",
"common_name": "Root X1",
"key_type": "ec",
})
require.NoError(t, err)
require.NotNil(t, resp)
require.NotEmpty(t, resp.Data)
require.NotEmpty(t, resp.Data["issuer_id"])

// Set up a testing role.
_, err = client.Logical().Write("pki/roles/local-testing", map[string]interface{}{
"allow_any_name": true,
"enforce_hostnames": false,
"key_type": "ec",
})
require.NoError(t, err)

// Run tidy so that tidy-status is not empty
_, err = client.Logical().Write("pki/tidy", map[string]interface{}{
"tidy_revoked_certs": true,
})
require.NoError(t, err)

// Since certificate counts are off by default, we shouldn't see counts in the tidy status
tidyStatus, err := client.Logical().Read("pki/tidy-status")
if err != nil {
t.Fatal(err)
}
// backendUUID should exist, we need this for metrics
backendUUID := tidyStatus.Data["internal_backend_uuid"].(string)
// "current_cert_store_count", "current_revoked_cert_count"
countData, ok := tidyStatus.Data["current_cert_store_count"]
if ok && countData != nil {
t.Fatalf("Certificate counting should be off by default, but current cert store count %v appeared in tidy status in unconfigured mount", countData)
}
revokedCountData, ok := tidyStatus.Data["current_revoked_cert_count"]
if ok && revokedCountData != nil {
t.Fatalf("Certificate counting should be off by default, but revoked cert count %v appeared in tidy status in unconfigured mount", revokedCountData)
}

// Since certificate counts are off by default, those metrics should not exist yet
mostRecentInterval := inmemSink.Data()[len(inmemSink.Data())-1]
_, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"]
if ok {
t.Fatalf("Certificate counting should be off by default, but revoked cert count was emitted as a metric in an unconfigured mount")
}
_, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"]
if ok {
t.Fatalf("Certificate counting should be off by default, but total certificate count was emitted as a metric in an unconfigured mount")
}

// Write the auto-tidy config.
_, err = client.Logical().Write("pki/config/auto-tidy", map[string]interface{}{
"enabled": true,
"interval_duration": "1s",
"tidy_cert_store": true,
"tidy_revoked_certs": true,
"safety_buffer": "1s",
"maintain_stored_certificate_counts": true,
"publish_stored_certificate_count_metrics": false,
})
require.NoError(t, err)

// Reload the Mount - Otherwise Stored Certificate Counts Will Not Be Populated
_, err = client.Logical().Write("/sys/plugins/reload/backend", map[string]interface{}{
"plugin": "pki",
})

// By reading the auto-tidy endpoint, we ensure that initialize has completed (which has a write lock on auto-tidy)
_, err = client.Logical().Read("/pki/config/auto-tidy")
if err != nil {
t.Fatal(err)
}

// Since publish_stored_certificate_count_metrics is still false, these metrics should still not exist yet
mostRecentInterval = inmemSink.Data()[len(inmemSink.Data())-1]
_, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"]
if ok {
t.Fatalf("Certificate counting should be off by default, but revoked cert count was emitted as a metric in an unconfigured mount")
}
_, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"]
if ok {
t.Fatalf("Certificate counting should be off by default, but total certificate count was emitted as a metric in an unconfigured mount")
}

// But since certificate counting is on, the metrics should exist on tidyStatus endpoint:
tidyStatus, err = client.Logical().Read("pki/tidy-status")
if err != nil {
t.Fatal(err)
}
// backendUUID should exist, we need this for metrics
backendUUID = tidyStatus.Data["internal_backend_uuid"].(string)
// "current_cert_store_count", "current_revoked_cert_count"
certStoreCount, ok := tidyStatus.Data["current_cert_store_count"]
if !ok {
t.Fatalf("Certificate counting has been turned on, but current cert store count does not appear in tidy status")
}
if certStoreCount != json.Number("1") {
t.Fatalf("Only created one certificate, but a got a certificate count of %v", certStoreCount)
}
revokedCertCount, ok := tidyStatus.Data["current_revoked_cert_count"]
if !ok {
t.Fatalf("Certificate counting has been turned on, but revoked cert store count does not appear in tidy status")
}
if revokedCertCount != json.Number("0") {
t.Fatalf("Have not yet revoked a certificate, but got a revoked cert store count of %v", revokedCertCount)
}

// Write the auto-tidy config, again, this time turning on metrics
_, err = client.Logical().Write("pki/config/auto-tidy", map[string]interface{}{
"enabled": true,
"interval_duration": "1s",
"tidy_cert_store": true,
"tidy_revoked_certs": true,
"safety_buffer": "1s",
"maintain_stored_certificate_counts": true,
"publish_stored_certificate_count_metrics": true,
})
require.NoError(t, err)

// Issue a cert and revoke it.
resp, err = client.Logical().Write("pki/issue/local-testing", map[string]interface{}{
"common_name": "example.com",
"ttl": "10s",
})
require.NoError(t, err)
require.NotNil(t, resp)
require.NotNil(t, resp.Data)
require.NotEmpty(t, resp.Data["serial_number"])
require.NotEmpty(t, resp.Data["certificate"])
leafSerial := resp.Data["serial_number"].(string)
leafCert := parseCert(t, resp.Data["certificate"].(string))

// Read cert before revoking
resp, err = client.Logical().Read("pki/cert/" + leafSerial)
require.NoError(t, err)
require.NotNil(t, resp)
require.NotNil(t, resp.Data)
require.NotEmpty(t, resp.Data["certificate"])
revocationTime, err := (resp.Data["revocation_time"].(json.Number)).Int64()
require.Equal(t, int64(0), revocationTime, "revocation time was not zero")
require.Empty(t, resp.Data["revocation_time_rfc3339"], "revocation_time_rfc3339 was not empty")
require.Empty(t, resp.Data["issuer_id"], "issuer_id was not empty")

_, err = client.Logical().Write("pki/revoke", map[string]interface{}{
"serial_number": leafSerial,
})
require.NoError(t, err)

// We read the auto-tidy endpoint again, to ensure any metrics logic has completed (lock on config)
_, err = client.Logical().Read("/pki/config/auto-tidy")
if err != nil {
t.Fatal(err)
}

// Check Metrics After Cert Has Be Created and Revoked
tidyStatus, err = client.Logical().Read("pki/tidy-status")
if err != nil {
t.Fatal(err)
}
backendUUID = tidyStatus.Data["internal_backend_uuid"].(string)
certStoreCount, ok = tidyStatus.Data["current_cert_store_count"]
if !ok {
t.Fatalf("Certificate counting has been turned on, but current cert store count does not appear in tidy status")
}
if certStoreCount != json.Number("2") {
t.Fatalf("Created root and leaf certificate, but a got a certificate count of %v", certStoreCount)
}
revokedCertCount, ok = tidyStatus.Data["current_revoked_cert_count"]
if !ok {
t.Fatalf("Certificate counting has been turned on, but revoked cert store count does not appear in tidy status")
}
if revokedCertCount != json.Number("1") {
t.Fatalf("Revoked one certificate, but got a revoked cert store count of %v", revokedCertCount)
}
// This should now be initialized
certCountError, ok := tidyStatus.Data["certificate_counting_error"]
if ok && certCountError.(string) != "" {
t.Fatalf("Expected certificate count error to disappear after initialization, but got error %v", certCountError)
}

testhelpers.RetryUntil(t, newPeriod*5, func() error {
mostRecentInterval = inmemSink.Data()[len(inmemSink.Data())-1]
revokedCertCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"]
if !ok {
return errors.New("turned on metrics, but revoked cert count was not emitted")
}
if revokedCertCountGaugeValue.Value != 1 {
return fmt.Errorf("revoked one certificate, but metrics emitted a revoked cert store count of %v", revokedCertCountGaugeValue)
}
certStoreCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"]
if !ok {
return errors.New("turned on metrics, but total certificate count was not emitted")
}
if certStoreCountGaugeValue.Value != 2 {
return fmt.Errorf("stored two certificiates, but total certificate count emitted was %v", certStoreCountGaugeValue.Value)
}
return nil
})

// Wait for cert to expire and the safety buffer to elapse.
time.Sleep(time.Until(leafCert.NotAfter) + 3*time.Second)

// Wait for auto-tidy to run afterwards.
var foundTidyRunning string
var foundTidyFinished bool
timeoutChan := time.After(120 * time.Second)
for {
if foundTidyRunning != "" && foundTidyFinished {
break
}

select {
case <-timeoutChan:
t.Fatalf("expected auto-tidy to run (%v) and finish (%v) before 120 seconds elapsed", foundTidyRunning, foundTidyFinished)
default:
time.Sleep(250 * time.Millisecond)

resp, err = client.Logical().Read("pki/tidy-status")
require.NoError(t, err)
require.NotNil(t, resp)
require.NotNil(t, resp.Data)
require.NotEmpty(t, resp.Data["state"])
require.NotEmpty(t, resp.Data["time_started"])
state := resp.Data["state"].(string)
started := resp.Data["time_started"].(string)
t.Logf("Resp: %v", resp.Data)

// We want the _next_ tidy run after the cert expires. This
// means if we're currently finished when we hit this the
// first time, we want to wait for the next run.
if foundTidyRunning == "" {
foundTidyRunning = started
} else if foundTidyRunning != started && !foundTidyFinished && state == "Finished" {
foundTidyFinished = true
}
}
}

// After Tidy, Cert Store Count Should Still Be Available, and Be Updated:
// Check Metrics After Cert Has Be Created and Revoked
tidyStatus, err = client.Logical().Read("pki/tidy-status")
if err != nil {
t.Fatal(err)
}
backendUUID = tidyStatus.Data["internal_backend_uuid"].(string)
// "current_cert_store_count", "current_revoked_cert_count"
certStoreCount, ok = tidyStatus.Data["current_cert_store_count"]
if !ok {
t.Fatalf("Certificate counting has been turned on, but current cert store count does not appear in tidy status")
}
if certStoreCount != json.Number("1") {
t.Fatalf("Created root and leaf certificate, deleted leaf, but a got a certificate count of %v", certStoreCount)
}
revokedCertCount, ok = tidyStatus.Data["current_revoked_cert_count"]
if !ok {
t.Fatalf("Certificate counting has been turned on, but revoked cert store count does not appear in tidy status")
}
if revokedCertCount != json.Number("0") {
t.Fatalf("Revoked certificate has been tidied, but got a revoked cert store count of %v", revokedCertCount)
}

testhelpers.RetryUntil(t, newPeriod*5, func() error {
mostRecentInterval = inmemSink.Data()[len(inmemSink.Data())-1]
revokedCertCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"]
if !ok {
return errors.New("turned on metrics, but revoked cert count was not emitted")
}
if revokedCertCountGaugeValue.Value != 0 {
return fmt.Errorf("revoked certificate has been tidied, but metrics emitted a revoked cert store count of %v", revokedCertCountGaugeValue)
}
certStoreCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"]
if !ok {
return errors.New("turned on metrics, but total certificate count was not emitted")
}
if certStoreCountGaugeValue.Value != 1 {
return fmt.Errorf("only one of two certificates left after tidy, but total certificate count emitted was %v", certStoreCountGaugeValue.Value)
}
return nil
})
}
28 changes: 27 additions & 1 deletion builtin/logical/pki/storage.go
Original file line number Diff line number Diff line change
@@ -1351,7 +1351,33 @@ func (sc *storageContext) writeAutoTidyConfig(config *tidyConfig) error {
return err
}

return sc.Storage.Put(sc.Context, entry)
err = sc.Storage.Put(sc.Context, entry)
if err != nil {
return err
}

sc.Backend.publishCertCountMetrics.Store(config.PublishMetrics)

// To Potentially Disable Certificate Counting
if config.MaintainCount == false {
certCountWasEnabled := sc.Backend.certCountEnabled.Swap(config.MaintainCount)
if certCountWasEnabled {
sc.Backend.certsCounted.Store(true)
sc.Backend.certCountError = "Cert Count is Disabled: enable via Tidy Config maintain_stored_certificate_counts"
sc.Backend.possibleDoubleCountedSerials = nil // This won't stop a list operation, but will stop an expensive clean-up during initialize
sc.Backend.possibleDoubleCountedRevokedSerials = nil // This won't stop a list operation, but will stop an expensive clean-up during initialize
sc.Backend.certCount.Store(0)
sc.Backend.revokedCertCount.Store(0)
}
} else { // To Potentially Enable Certificate Counting
if sc.Backend.certCountEnabled.Load() == false {
// We haven't written "re-enable certificate counts" outside the initialize function
// Any call derived call to do so is likely to time out on ~2 million certs
sc.Backend.certCountError = "Certificate Counting Has Not Been Initialized, re-initialize this mount"
}
}

return nil
}

func (sc *storageContext) listRevokedCerts() ([]string, error) {
6 changes: 6 additions & 0 deletions changelog/18186.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
```release-note:breaking-change
secrets/pki: Maintaining running count of certificates will be turned off by default.
To re-enable keeping these metrics available on the tidy status endpoint, enable
maintain_stored_certificate_counts on tidy-config, to also publish them to the
metrics consumer, enable publish_stored_certificate_count_metrics .
```