-
Notifications
You must be signed in to change notification settings - Fork 14.5k
/
Copy pathotel_logger.py
425 lines (347 loc) · 15.6 KB
/
otel_logger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations
import datetime
import logging
import random
import warnings
from functools import partial
from typing import Callable, Iterable, Union
from opentelemetry import metrics
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
from opentelemetry.metrics import Instrument, Observation
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics._internal.export import ConsoleMetricExporter, PeriodicExportingMetricReader
from opentelemetry.sdk.resources import SERVICE_NAME, Resource
from opentelemetry.util.types import Attributes
from airflow.configuration import conf
from airflow.metrics.protocols import DeltaType, Timer, TimerProtocol
from airflow.metrics.validators import (
OTEL_NAME_MAX_LENGTH,
AllowListValidator,
stat_name_otel_handler,
)
log = logging.getLogger(__name__)
GaugeValues = Union[int, float]
DEFAULT_GAUGE_VALUE = 0.0
# "airflow.dag_processing.processes" is currently the only UDC used in Airflow. If more are added,
# we should add a better system for this.
#
# Generally in OTel a Counter is monotonic (can only go up) and there is an UpDownCounter which,
# as you can guess, is non-monotonic; it can go up or down. The choice here is to either drop
# this one metric and implement the rest as monotonic Counters, implement all counters as
# UpDownCounters, or add a bit of logic to do it intelligently. The catch is that the Collector
# which transmits these metrics to the upstream dashboard tools (Prometheus, Grafana, etc.) assigns
# the type of Gauge to any UDC instead of Counter. Adding this logic feels like the best compromise
# where normal Counters still get typed correctly, and we don't lose an existing metric.
# See:
# https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/metrics/api.md#counter-creation
# https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/metrics/api.md#updowncounter
UP_DOWN_COUNTERS = {"airflow.dag_processing.processes"}
DEFAULT_METRIC_NAME_PREFIX = "airflow"
# Delimiter is placed between the universal metric prefix and the unique metric name.
DEFAULT_METRIC_NAME_DELIMITER = "."
def full_name(name: str, *, prefix: str = DEFAULT_METRIC_NAME_PREFIX) -> str:
"""Assembles the prefix, delimiter, and name and returns it as a string."""
return f"{prefix}{DEFAULT_METRIC_NAME_DELIMITER}{name}"
def _is_up_down_counter(name):
return name in UP_DOWN_COUNTERS
def _generate_key_name(name: str, attributes: Attributes = None):
if attributes:
key = name
for item in attributes.items():
key += f"_{item[0]}_{item[1]}"
else:
key = name
return key
def name_is_otel_safe(prefix: str, name: str) -> bool:
"""
Returns True if the provided name and prefix would result in a name that meets the OpenTelemetry standard.
Legal names are defined here:
https://opentelemetry.io/docs/reference/specification/metrics/api/#instrument-name-syntax
"""
return bool(stat_name_otel_handler(prefix, name, max_length=OTEL_NAME_MAX_LENGTH))
def _type_as_str(obj: Instrument) -> str:
"""
Given an OpenTelemetry Instrument, returns the type of the instrument as a string.
:param obj: An OTel Instrument or subclass
:returns: The type() of the Instrument without all the nested class info
"""
# type().__name__ will return something like: '_Counter',
# this drops the leading underscore for cleaner logging.
return type(obj).__name__[1:]
def _get_otel_safe_name(name: str) -> str:
"""
Verifies that the provided name does not exceed OpenTelemetry's maximum length for metric names.
:param name: The original metric name
:returns: The name, truncated to an OTel-acceptable length if required.
"""
otel_safe_name = name[:OTEL_NAME_MAX_LENGTH]
if name != otel_safe_name:
warnings.warn(
f"Metric name `{name}` exceeds OpenTelemetry's name length limit of "
f"{OTEL_NAME_MAX_LENGTH} characters and will be truncated to `{otel_safe_name}`."
)
return otel_safe_name
def _skip_due_to_rate(rate: float) -> bool:
if rate < 0:
raise ValueError("rate must be a positive value.")
return rate < 1 and random.random() > rate
class _OtelTimer(Timer):
"""
An implementation of Stats.Timer() which records the result in the OTel Metrics Map.
OpenTelemetry does not have a native timer, we will store the values as a Gauge.
:param name: The name of the timer.
:param tags: Tags to append to the timer.
"""
def __init__(self, otel_logger: SafeOtelLogger, name: str | None, tags: Attributes):
super().__init__()
self.otel_logger = otel_logger
self.name = name
self.tags = tags
def stop(self, send: bool = True) -> None:
super().stop(send)
if self.name and send:
self.otel_logger.metrics_map.set_gauge_value(
full_name(prefix=self.otel_logger.prefix, name=self.name), self.duration, False, self.tags
)
class SafeOtelLogger:
"""Otel Logger."""
def __init__(
self,
otel_provider,
prefix: str = DEFAULT_METRIC_NAME_PREFIX,
allow_list_validator=AllowListValidator(),
):
self.otel: Callable = otel_provider
self.prefix: str = prefix
self.metrics_validator = allow_list_validator
self.meter = otel_provider.get_meter(__name__)
self.metrics_map = MetricsMap(self.meter)
def incr(
self,
stat: str,
count: int = 1,
rate: float = 1,
tags: Attributes = None,
):
"""
Increment stat by count.
:param stat: The name of the stat to increment.
:param count: A positive integer to add to the current value of stat.
:param rate: value between 0 and 1 that represents the sample rate at
which the metric is going to be emitted.
:param tags: Tags to append to the stat.
"""
if _skip_due_to_rate(rate):
return
if count < 0:
raise ValueError("count must be a positive value.")
if self.metrics_validator.test(stat) and name_is_otel_safe(self.prefix, stat):
counter = self.metrics_map.get_counter(full_name(prefix=self.prefix, name=stat), attributes=tags)
counter.add(count, attributes=tags)
return counter
def decr(
self,
stat: str,
count: int = 1,
rate: float = 1,
tags: Attributes = None,
):
"""
Decrement stat by count.
:param stat: The name of the stat to decrement.
:param count: A positive integer to subtract from current value of stat.
:param rate: value between 0 and 1 that represents the sample rate at
which the metric is going to be emitted.
:param tags: Tags to append to the stat.
"""
if _skip_due_to_rate(rate):
return
if count < 0:
raise ValueError("count must be a positive value.")
if self.metrics_validator.test(stat) and name_is_otel_safe(self.prefix, stat):
counter = self.metrics_map.get_counter(full_name(prefix=self.prefix, name=stat))
counter.add(-count, attributes=tags)
return counter
def gauge(
self,
stat: str,
value: int | float,
rate: float = 1,
delta: bool = False,
*,
tags: Attributes = None,
back_compat_name: str = "",
) -> None:
"""
Record a new value for a Gauge.
:param stat: The name of the stat to update.
:param value: The new value of stat, either a float or an int.
:param rate: value between 0 and 1 that represents the sample rate at
which the metric is going to be emitted.
:param delta: If true, the provided value will be added to the previous value.
If False the new value will override the previous.
:param tags: Tags to append to the stat.
:param back_compat_name: If an alternative name is provided, the
stat will be emitted using both names if possible.
"""
if _skip_due_to_rate(rate):
return
if back_compat_name and self.metrics_validator.test(back_compat_name):
self.metrics_map.set_gauge_value(
full_name(prefix=self.prefix, name=back_compat_name), value, delta, tags
)
if self.metrics_validator.test(stat):
self.metrics_map.set_gauge_value(full_name(prefix=self.prefix, name=stat), value, delta, tags)
def timing(
self,
stat: str,
dt: DeltaType,
*,
tags: Attributes = None,
) -> None:
"""OTel does not have a native timer, stored as a Gauge whose value is number of seconds elapsed."""
if self.metrics_validator.test(stat) and name_is_otel_safe(self.prefix, stat):
if isinstance(dt, datetime.timedelta):
dt = dt.total_seconds()
self.metrics_map.set_gauge_value(full_name(prefix=self.prefix, name=stat), float(dt), False, tags)
def timer(
self,
stat: str | None = None,
*args,
tags: Attributes = None,
**kwargs,
) -> TimerProtocol:
"""Timer context manager returns the duration and can be cancelled."""
return _OtelTimer(self, stat, tags)
class MetricsMap:
"""Stores Otel Instruments."""
def __init__(self, meter):
self.meter = meter
self.map = {}
def clear(self) -> None:
self.map.clear()
def _create_counter(self, name):
"""Creates a new counter or up_down_counter for the provided name."""
otel_safe_name = _get_otel_safe_name(name)
if _is_up_down_counter(name):
counter = self.meter.create_up_down_counter(name=otel_safe_name)
else:
counter = self.meter.create_counter(name=otel_safe_name)
logging.debug("Created %s as type: %s", otel_safe_name, _type_as_str(counter))
return counter
def get_counter(self, name: str, attributes: Attributes = None):
"""
Returns the counter; creates a new one if it did not exist.
:param name: The name of the counter to fetch or create.
:param attributes: Counter attributes, used to generate a unique key to store the counter.
"""
key = _generate_key_name(name, attributes)
if key in self.map.keys():
return self.map[key]
else:
new_counter = self._create_counter(name)
self.map[key] = new_counter
return new_counter
def del_counter(self, name: str, attributes: Attributes = None) -> None:
"""
Deletes a counter.
:param name: The name of the counter to delete.
:param attributes: Counter attributes which were used to generate a unique key to store the counter.
"""
key = _generate_key_name(name, attributes)
if key in self.map.keys():
del self.map[key]
def set_gauge_value(self, name: str, value: float | None, delta: bool, tags: Attributes):
"""
Overrides the last reading for a Gauge with a new value.
:param name: The name of the gauge to record.
:param value: The new reading to record.
:param delta: If True, value is added to the previous reading, else it overrides.
:param tags: Gauge attributes which were used to generate a unique key to store the counter.
:returns: None
"""
key: str = _generate_key_name(name, tags)
new_value = value or DEFAULT_GAUGE_VALUE
old_value = self.poke_gauge(name, tags)
if delta:
new_value += old_value
# If delta is true, add the new value to the last reading otherwise overwrite it.
self.map[key] = Observation(new_value, tags)
def _create_gauge(self, name: str, attributes: Attributes = None):
"""
Creates a new Observable Gauge with the provided name and the default value.
:param name: The name of the gauge to fetch or create.
:param attributes: Gauge attributes, used to generate a unique key to store the gauge.
"""
otel_safe_name = _get_otel_safe_name(name)
key = _generate_key_name(name, attributes)
gauge = self.meter.create_observable_gauge(
name=otel_safe_name,
callbacks=[partial(self.read_gauge, _generate_key_name(name, attributes))],
)
self.map[key] = Observation(DEFAULT_GAUGE_VALUE, attributes)
return gauge
def read_gauge(self, key: str, *args) -> Iterable[Observation]:
"""Callback for the Observable Gauges, returns the Observation for the provided key."""
yield self.map[key]
def poke_gauge(self, name: str, attributes: Attributes = None) -> GaugeValues:
"""
Returns the value of the gauge; creates a new one with the default value if it did not exist.
:param name: The name of the gauge to fetch or create.
:param attributes: Gauge attributes, used to generate a unique key to store the gauge.
:returns: The integer or float value last recorded for the provided Gauge name.
"""
key = _generate_key_name(name, attributes)
if key not in self.map:
self._create_gauge(name, attributes)
return self.map[key].value
def get_otel_logger(cls) -> SafeOtelLogger:
host = conf.get("metrics", "otel_host") # ex: "breeze-otel-collector"
port = conf.getint("metrics", "otel_port") # ex: 4318
prefix = conf.get("metrics", "otel_prefix") # ex: "airflow"
ssl_active = conf.getboolean("metrics", "otel_ssl_active")
# PeriodicExportingMetricReader will default to an interval of 60000 millis.
interval = conf.getint("metrics", "otel_interval_milliseconds", fallback=None) # ex: 30000
debug = conf.getboolean("metrics", "otel_debugging_on")
allow_list = conf.get("metrics", "metrics_allow_list", fallback=None)
allow_list_validator = AllowListValidator(allow_list)
resource = Resource(attributes={SERVICE_NAME: "Airflow"})
protocol = "https" if ssl_active else "http"
endpoint = f"{protocol}://{host}:{port}/v1/metrics"
logging.info("[Metric Exporter] Connecting to OpenTelemetry Collector at %s", endpoint)
readers = [
PeriodicExportingMetricReader(
OTLPMetricExporter(
endpoint=endpoint,
headers={"Content-Type": "application/json"},
),
export_interval_millis=interval,
)
]
if debug:
export_to_console = PeriodicExportingMetricReader(ConsoleMetricExporter())
readers.append(export_to_console)
metrics.set_meter_provider(
MeterProvider(
resource=resource,
metric_readers=readers,
shutdown_on_exit=False,
),
)
return SafeOtelLogger(metrics.get_meter_provider(), prefix, allow_list_validator)