From 1979d7872f4f4584209921c35d9279397cde0e2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Fern=C3=A1ndez=20Haro?=
 <alejandro.haro@elastic.co>
Date: Fri, 18 Sep 2020 19:01:18 +0100
Subject: [PATCH] Cherry-pick to 7.9: [Metricbeat][Kibana] Apply backoff when
 errored at getting usage stats (#20772) (#21162)

Co-authored-by: Shaunak Kashyap <ycombinator@gmail.com>

Co-authored-by: Shaunak Kashyap <ycombinator@gmail.com>
---
 CHANGELOG.next.asciidoc                      |  1 +
 metricbeat/module/kibana/stats/stats.go      | 14 +++++--
 metricbeat/module/kibana/stats/stats_test.go | 44 ++++++++++++++++++--
 3 files changed, 52 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc
index 0b799cb12be3..1d5cafcb155b 100644
--- a/CHANGELOG.next.asciidoc
+++ b/CHANGELOG.next.asciidoc
@@ -101,6 +101,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d
 - Fix storage metricset to allow config without region/zone. {issue}17623[17623] {pull}17624[17624]
 - Fix overflow on Prometheus rates when new buckets are added on the go. {pull}17753[17753]
 - Add a switch to the driver definition on SQL module to use pretty names {pull}17378[17378]
+- The Kibana collector applies backoff when errored at getting usage stats {pull}20772[20772]
 - The `elasticsearch/index` metricset only requests wildcard expansion for hidden indices if the monitored Elasticsearch cluster supports it. {pull}20938[20938]
 - Fix panic index out of range error when getting AWS account name. {pull}21101[21101] {issue}21095[21095]
 - Handle missing counters in the application_pool metricset. {pull}21071[21071]
diff --git a/metricbeat/module/kibana/stats/stats.go b/metricbeat/module/kibana/stats/stats.go
index 526a525951c2..a6e19d50f42c 100644
--- a/metricbeat/module/kibana/stats/stats.go
+++ b/metricbeat/module/kibana/stats/stats.go
@@ -38,9 +38,10 @@ func init() {
 }
 
 const (
-	statsPath             = "api/stats"
-	settingsPath          = "api/settings"
-	usageCollectionPeriod = 24 * time.Hour
+	statsPath              = "api/stats"
+	settingsPath           = "api/settings"
+	usageCollectionPeriod  = 24 * time.Hour
+	usageCollectionBackoff = 1 * time.Hour
 )
 
 var (
@@ -57,6 +58,7 @@ type MetricSet struct {
 	statsHTTP            *helper.HTTP
 	settingsHTTP         *helper.HTTP
 	usageLastCollectedOn time.Time
+	usageNextCollectOn   time.Time
 	isUsageExcludable    bool
 }
 
@@ -165,6 +167,10 @@ func (m *MetricSet) fetchStats(r mb.ReporterV2, now time.Time) error {
 
 		content, err = m.statsHTTP.FetchContent()
 		if err != nil {
+			if shouldCollectUsage {
+				// When errored in collecting the usage stats it may be counterproductive to try again on the next poll, try to collect the stats again after usageCollectionBackoff
+				m.usageNextCollectOn = now.Add(usageCollectionBackoff)
+			}
 			return err
 		}
 
@@ -215,5 +221,5 @@ func (m *MetricSet) calculateIntervalMs() int64 {
 }
 
 func (m *MetricSet) shouldCollectUsage(now time.Time) bool {
-	return now.Sub(m.usageLastCollectedOn) > usageCollectionPeriod
+	return now.Sub(m.usageLastCollectedOn) > usageCollectionPeriod && now.Sub(m.usageNextCollectOn) > 0
 }
diff --git a/metricbeat/module/kibana/stats/stats_test.go b/metricbeat/module/kibana/stats/stats_test.go
index b6757e5ecb21..56cbfc17e1f2 100644
--- a/metricbeat/module/kibana/stats/stats_test.go
+++ b/metricbeat/module/kibana/stats/stats_test.go
@@ -23,6 +23,7 @@ import (
 	"net/http"
 	"net/http/httptest"
 	"testing"
+	"time"
 
 	"github.com/stretchr/testify/require"
 
@@ -48,12 +49,12 @@ func TestFetchUsage(t *testing.T) {
 				w.WriteHeader(503)
 
 			case 1: // second call
-				// Make sure exclude_usage is still false since first call failed
-				require.Equal(t, "false", excludeUsage)
+				// Make sure exclude_usage is true since first call failed and it should not try again until usageCollectionBackoff time has passed
+				require.Equal(t, "true", excludeUsage)
 				w.WriteHeader(200)
 
 			case 2: // third call
-				// Make sure exclude_usage is now true since second call succeeded
+				// Make sure exclude_usage is still true
 				require.Equal(t, "true", excludeUsage)
 				w.WriteHeader(200)
 			}
@@ -76,3 +77,40 @@ func TestFetchUsage(t *testing.T) {
 	// Third fetch
 	mbtest.ReportingFetchV2Error(f)
 }
+
+func TestShouldCollectUsage(t *testing.T) {
+	now := time.Now()
+
+	cases := map[string]struct {
+		usageLastCollectedOn time.Time
+		usageNextCollectOn   time.Time
+		expectedResult       bool
+	}{
+		"within_usage_collection_period": {
+			usageLastCollectedOn: now.Add(-1 * usageCollectionPeriod),
+			expectedResult:       false,
+		},
+		"after_usage_collection_period_but_before_next_scheduled_collection": {
+			usageLastCollectedOn: now.Add(-2 * usageCollectionPeriod),
+			usageNextCollectOn:   now.Add(3 * time.Hour),
+			expectedResult:       false,
+		},
+		"after_usage_collection_period_and_after_next_scheduled_collection": {
+			usageLastCollectedOn: now.Add(-2 * usageCollectionPeriod),
+			usageNextCollectOn:   now.Add(-1 * time.Hour),
+			expectedResult:       true,
+		},
+	}
+
+	for name, test := range cases {
+		t.Run(name, func(t *testing.T) {
+			m := MetricSet{
+				usageLastCollectedOn: test.usageLastCollectedOn,
+				usageNextCollectOn:   test.usageNextCollectOn,
+			}
+
+			actualResult := m.shouldCollectUsage(now)
+			require.Equal(t, test.expectedResult, actualResult)
+		})
+	}
+}