alerts.go
package alertanalyzer

import (
    "context"
    "fmt"
    "math"
    "sort"
    "strings"
    "time"

    routeclient "github.com/openshift/client-go/route/clientset/versioned"
    "github.com/openshift/library-go/test/library/metrics"
    "github.com/openshift/origin/pkg/monitor/monitorapi"
    "github.com/openshift/origin/pkg/monitortestlibrary/prometheus"
    prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
    prometheustypes "github.com/prometheus/common/model"
    "github.com/sirupsen/logrus"
    apierrors "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/rest"
)
func fetchEventIntervalsForAllAlerts(ctx context.Context, restConfig *rest.Config, startTime time.Time) ([]monitorapi.Interval, error) {
    kubeClient, err := kubernetes.NewForConfig(restConfig)
    if err != nil {
        return nil, err
    }
    routeClient, err := routeclient.NewForConfig(restConfig)
    if err != nil {
        return nil, err
    }
    _, err = kubeClient.CoreV1().Namespaces().Get(ctx, "openshift-monitoring", metav1.GetOptions{})
    if apierrors.IsNotFound(err) {
        return []monitorapi.Interval{}, nil
    }

    prometheusClient, err := metrics.NewPrometheusClient(ctx, kubeClient, routeClient)
    if err != nil {
        return nil, err
    }

    intervals, err := prometheus.EnsureThanosQueriersConnectedToPromSidecars(ctx, prometheusClient)
    if err != nil {
        return intervals, err
    }

    timeRange := prometheusv1.Range{
        Start: startTime,
        End:   time.Now(),
        Step:  2 * time.Second,
    }
    alerts, warningsForQuery, err := prometheusClient.QueryRange(ctx, `ALERTS{alertstate="firing"}`, timeRange)
    if err != nil {
        return nil, err
    }
    if len(warningsForQuery) > 0 {
        fmt.Printf("#### warnings \n\t%v\n", strings.Join(warningsForQuery, "\n\t"))
    }
    firingAlerts, err := createEventIntervalsForAlerts(ctx, alerts, startTime)
    if err != nil {
        return nil, err
    }

    alerts, warningsForQuery, err = prometheusClient.QueryRange(ctx, `ALERTS{alertstate="pending"}`, timeRange)
    if err != nil {
        return nil, err
    }
    if len(warningsForQuery) > 0 {
        fmt.Printf("#### warnings \n\t%v\n", strings.Join(warningsForQuery, "\n\t"))
    }
    pendingAlerts, err := createEventIntervalsForAlerts(ctx, alerts, startTime)
    if err != nil {
        return nil, err
    }

    // Firing alerts trump pending alerts: if the two would overlap when rendered, break the pending
    // intervals up around the firing ones, so an alert is never listed as pending at the same time
    // it is firing in our intervals.
    pendingAlerts = blackoutEvents(pendingAlerts, firingAlerts)

    ret := []monitorapi.Interval{}
    ret = append(ret, firingAlerts...)
    ret = append(ret, pendingAlerts...)
    return ret, nil
}
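
// A minimal usage sketch (not part of the original file): assuming the test process can build a
// *rest.Config for the cluster under test, e.g. via rest.InClusterConfig, the collector could be
// invoked directly like this. In origin this is normally driven by the monitor framework instead.
func exampleFetchAlertIntervals(ctx context.Context) ([]monitorapi.Interval, error) {
    cfg, err := rest.InClusterConfig() // any valid *rest.Config for the target cluster works
    if err != nil {
        return nil, err
    }
    // Gather firing and pending alert intervals observed since one hour ago.
    return fetchEventIntervalsForAllAlerts(ctx, cfg, time.Now().Add(-time.Hour))
}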
// blackoutEvents filters startingEvents and rewrites them into potentially multiple events to avoid
// overlap with the blackoutWindows. For instance, if startingEvents for locator/foo covers 1:00-1:45
// and blackoutWindows for locator/foo covers 1:10-1:15 and 1:40-1:50, the return has locator/foo
// covering 1:00-1:10 and 1:15-1:40.
func blackoutEvents(startingEvents, blackoutWindows []monitorapi.Interval) []monitorapi.Interval {
    ret := []monitorapi.Interval{}

    blackoutsByLocator := indexByLocator(blackoutWindows)
    for i := range startingEvents {
        startingEvent := startingEvents[i]
        blackouts := blackoutsByLocator[startingEvent.Locator.OldLocator()]
        if len(blackouts) == 0 {
            ret = append(ret, startingEvent)
            continue
        }

        relatedBlackouts := nonOverlappingBlackoutWindowsFromEvents(blackouts)
        currStartTime := startingEvent.From
        maxEndTime := startingEvent.To
        for i, currBlackout := range relatedBlackouts {
            if currBlackout.To.Before(currStartTime) { // too early, does not apply
                continue
            }
            if currBlackout.From.After(maxEndTime) { // too late, does not apply and we're done
                break
            }
            var nextBlackout *blackoutWindow
            if nextIndex := i + 1; nextIndex < len(relatedBlackouts) {
                nextBlackout = &relatedBlackouts[nextIndex]
            }

            switch {
            case currBlackout.From.Before(currStartTime) || currBlackout.From == currStartTime:
                // if the blackout starts before (or exactly at) currStartTime, then the new start time
                // will be when this blackout ends
                eventNext := startingEvent
                eventNext.From = currBlackout.To
                if nextBlackout != nil && nextBlackout.From.Before(maxEndTime) {
                    eventNext.To = nextBlackout.From
                } else {
                    eventNext.To = maxEndTime
                }
                currStartTime = eventNext.To
                if eventNext.From != eventNext.To && eventNext.From.Before(eventNext.To) {
                    ret = append(ret, eventNext)
                }

                // if we're at the end of the blackout list
                if nextBlackout == nil {
                    eventNext = startingEvent
                    eventNext.From = currStartTime
                    eventNext.To = maxEndTime
                    currStartTime = eventNext.To
                    if eventNext.From != eventNext.To && eventNext.From.Before(eventNext.To) {
                        ret = append(ret, eventNext)
                    }
                }

            case currBlackout.To.After(maxEndTime) || currBlackout.To == maxEndTime:
                // this should be the last blackout that applies to us, because all the later ones start
                // *after* this one's To. If the blackout ends after maxEndTime, then the new maxEndTime
                // will be when this blackout starts
                eventNext := startingEvent
                eventNext.From = currStartTime
                eventNext.To = currBlackout.From
                currStartTime = eventNext.To
                if eventNext.From != eventNext.To && eventNext.From.Before(eventNext.To) {
                    ret = append(ret, eventNext)
                }

            default:
                // if we're here, then the blackout is in the middle of our overall timeframe
                eventNext := startingEvent
                eventNext.From = currStartTime
                eventNext.To = currBlackout.From
                currStartTime = currBlackout.To
                if eventNext.From != eventNext.To && eventNext.From.Before(eventNext.To) {
                    ret = append(ret, eventNext)
                }

                // if we're at the end of the blackout list
                if nextBlackout == nil {
                    eventNext = startingEvent
                    eventNext.From = currStartTime
                    eventNext.To = maxEndTime
                    currStartTime = eventNext.To
                    if eventNext.From != eventNext.To && eventNext.From.Before(eventNext.To) {
                        ret = append(ret, eventNext)
                    }
                }
            }

            // we're done
            if !currStartTime.Before(maxEndTime) {
                break
            }
        }
    }

    sort.Sort(monitorapi.Intervals(ret))
    return ret
}

type blackoutWindow struct {
    From time.Time
    To   time.Time
}

func nonOverlappingBlackoutWindowsFromEvents(blackoutWindows []monitorapi.Interval) []blackoutWindow {
    sort.Sort(monitorapi.Intervals(blackoutWindows))

    ret := []blackoutWindow{}
    for _, sourceWindow := range blackoutWindows {
        if len(ret) == 0 {
            ret = append(ret, blackoutWindow{
                From: sourceWindow.From,
                To:   sourceWindow.To,
            })
            continue
        }

        newRet := make([]blackoutWindow, len(ret))
        copy(newRet, ret)

        for j := range ret {
            resultWindow := ret[j]

            switch {
            case sourceWindow.From.After(resultWindow.From) && sourceWindow.To.Before(resultWindow.To):
                // strictly smaller, the source window can be ignored

            case sourceWindow.From.After(resultWindow.To):
                // too late, does not overlap; add the source window
                newRet = append(newRet, blackoutWindow{
                    From: sourceWindow.From,
                    To:   sourceWindow.To,
                })

            case sourceWindow.To.Before(resultWindow.From):
                // too early, does not overlap; add the source window
                newRet = append(newRet, blackoutWindow{
                    From: sourceWindow.From,
                    To:   sourceWindow.To,
                })

            case sourceWindow.From.Before(resultWindow.From) && sourceWindow.To.After(resultWindow.To):
                // strictly larger, the source window times should overwrite both bounds
                resultWindow.From = sourceWindow.From
                resultWindow.To = sourceWindow.To
                newRet[j] = resultWindow

            case sourceWindow.From.Before(resultWindow.From):
                // the source window starts before the result window and ends somewhere during it, so the
                // window should start earlier
                resultWindow.From = sourceWindow.From
                newRet[j] = resultWindow

            case sourceWindow.To.After(resultWindow.To):
                // the source window ends after the result window and starts somewhere during it, so the
                // window should end later
                resultWindow.To = sourceWindow.To
                newRet[j] = resultWindow

            default:
                // let's hope we don't do anything here
            }
        }

        ret = newRet
    }

    return ret
}
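
// A hedged sketch (not in the original file) of a simpler take on the same goal, expressed as plain
// interval merging over the local blackoutWindow type and assuming the input is already sorted by
// From: overlapping or touching windows collapse into one. The function above does equivalent work
// over monitorapi.Interval values by growing existing windows in place rather than building a
// merged list.
func mergeSortedWindows(windows []blackoutWindow) []blackoutWindow {
    merged := []blackoutWindow{}
    for _, w := range windows {
        if n := len(merged); n > 0 && !w.From.After(merged[n-1].To) {
            // w overlaps (or touches) the previous merged window; extend it if needed.
            if w.To.After(merged[n-1].To) {
                merged[n-1].To = w.To
            }
            continue
        }
        merged = append(merged, w)
    }
    return merged
}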

func indexByLocator(events []monitorapi.Interval) map[string][]monitorapi.Interval {
    ret := map[string][]monitorapi.Interval{}
    for i := range events {
        event := events[i]
        ret[event.Locator.OldLocator()] = append(ret[event.Locator.OldLocator()], event)
    }
    return ret
}

func createEventIntervalsForAlerts(ctx context.Context, alerts prometheustypes.Value, startTime time.Time) ([]monitorapi.Interval, error) {
    ret := []monitorapi.Interval{}

    switch {
    case alerts.Type() == prometheustypes.ValMatrix:
        matrixAlert := alerts.(prometheustypes.Matrix)
        for _, alert := range matrixAlert {
            lb := monitorapi.NewLocator().AlertFromPromSampleStream(alert)

            var level monitorapi.IntervalLevel
            switch {
            // as I understand it, pending alerts are cases where all the conditions except "how long has
            // this been happening" are met. Pending alerts include the level the eventual alert will be,
            // but they are not errors in and of themselves. They are useful to show in the timeline to
            // find patterns of "X fails concurrent with Y".
            case alert.Metric["alertstate"] == "pending":
                level = monitorapi.Info
            case alert.Metric["severity"] == "warning":
                level = monitorapi.Warning
            case alert.Metric["severity"] == "critical":
                level = monitorapi.Error
            case alert.Metric["severity"] == "info": // this case may not exist
                level = monitorapi.Info
            default:
                level = monitorapi.Error
            }
            msg := monitorapi.NewMessage().HumanMessage(alert.Metric.String())
            if len(string(alert.Metric["alertstate"])) > 0 {
                msg = msg.WithAnnotation(monitorapi.AnnotationAlertState, string(alert.Metric["alertstate"]))
            }
            if len(string(alert.Metric["severity"])) > 0 {
                msg = msg.WithAnnotation(monitorapi.AnnotationSeverity, string(alert.Metric["severity"]))
            }
            alertIntervalTemplate :=
                monitorapi.NewInterval(monitorapi.SourceAlert, level).
                    Locator(lb).
                    Message(msg)

            var alertStartTime *time.Time
            var lastTime *time.Time
            for _, currValue := range alert.Values {
                currTime := currValue.Timestamp.Time()
                if alertStartTime == nil {
                    alertStartTime = &currTime
                }
                if lastTime == nil {
                    lastTime = &currTime
                }
                // if it has been less than five seconds since we last saw this alert, consider it the same
                // interval and check the next sample.
                if math.Abs(currTime.Sub(*lastTime).Seconds()) < (5 * time.Second).Seconds() {
                    lastTime = &currTime
                    continue
                }

                // if it has been more than five seconds, consider this the start of a new occurrence and
                // add the interval
                ret = append(ret, alertIntervalTemplate.Build(*alertStartTime, *lastTime))

                // now reset the tracking
                alertStartTime = &currTime
                lastTime = nil
            }

            // now add the interval for the last start time. If we do not have a last time, it means we saw
            // the start but not the end. We don't know when this alert ended, but our threshold time from
            // above is five seconds, so we simply assign that here as "better than nothing".
            if lastTime == nil {
                t := alertStartTime.Add(5 * time.Second)
                lastTime = &t
            }
            ret = append(ret, alertIntervalTemplate.Build(*alertStartTime, *lastTime))
        }

    default:
        logrus.WithField("type", alerts.Type()).Warning("unhandled prometheus alert type received in alert monitor")
    }

    return ret, nil
}
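
// A minimal, standard-library-only sketch (not in the original file) of the gap-splitting rule used
// above: consecutive sample timestamps less than five seconds apart fold into one [start, last]
// window, and a larger gap starts a new window. The helper name and sample data are illustrative
// only; the real loop above additionally pads an unterminated window by the five-second threshold.
func splitOnGaps(samples []time.Time, gap time.Duration) [][2]time.Time {
    windows := [][2]time.Time{}
    if len(samples) == 0 {
        return windows
    }
    start, last := samples[0], samples[0]
    for _, t := range samples[1:] {
        if t.Sub(last) < gap {
            last = t // still the same occurrence; keep extending it
            continue
        }
        windows = append(windows, [2]time.Time{start, last}) // close out the previous occurrence
        start, last = t, t
    }
    return append(windows, [2]time.Time{start, last})
}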