-
Notifications
You must be signed in to change notification settings - Fork 192
/
Copy pathsystem-metrics.yaml
541 lines (503 loc) · 16.1 KB
/
system-metrics.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
groups:
# General system attributes
# Attribute group with prefix `system`. Its `device` attribute is what the
# disk, filesystem and network metrics in this file reference as
# `system.device`.
- id: attributes.system
  type: attribute_group
  prefix: system
  brief: 'Describes System metric attributes'
  attributes:
  - id: device
    type: string
    stability: experimental
    brief: 'The device identifier'
    examples:
    - '(identifier)'
# system.cpu.* metrics and attribute group
# Attributes under the `system.cpu` prefix: `state` (enum that also accepts
# custom values) and `logical_number` (int).
- id: attributes.system.cpu
  type: attribute_group
  prefix: system.cpu
  brief: 'Describes System CPU metric attributes'
  attributes:
  - id: state
    stability: experimental
    brief: >-
      The CPU state for this data point. A system's CPU SHOULD be characterized
      *either* by data points with no `state` labels, *or only* data points with
      `state` labels.
    examples:
    - idle
    - interrupt
    type:
      allow_custom_values: true
      members:
      - id: user
        value: user
      - id: system
        value: system
      - id: nice
        value: nice
      - id: idle
        value: idle
      - id: iowait
        value: iowait
      - id: interrupt
        value: interrupt
      - id: steal
        value: steal
  - id: logical_number
    type: int
    stability: experimental
    brief: 'The logical CPU number [0..n-1]'
    examples:
    - 1
# Counter in seconds, broken down by `system.cpu.state` and
# `system.cpu.logical_number`.
- id: metric.system.cpu.time
  type: metric
  metric_name: system.cpu.time
  stability: experimental
  instrument: counter
  unit: 's'
  brief: 'Seconds each logical CPU spent on each mode'
  attributes:
  - ref: system.cpu.state
  - ref: system.cpu.logical_number
# Gauge with the dimensionless unit "1"; per its brief it is derived from
# `system.cpu.time`, and it carries the same two attributes as that metric.
- id: metric.system.cpu.utilization
  type: metric
  metric_name: system.cpu.utilization
  stability: experimental
  instrument: gauge
  unit: '1'
  brief: >-
    Difference in system.cpu.time since the last measurement, divided by the
    elapsed time and number of logical CPUs
  attributes:
  - ref: system.cpu.state
  - ref: system.cpu.logical_number
# Gauge reported per logical CPU (`system.cpu.logical_number`).
- id: metric.system.cpu.frequency
  type: metric
  metric_name: system.cpu.frequency
  stability: experimental
  instrument: gauge
  # NOTE(review): the unit is written with curly braces, unlike "s" and "By"
  # elsewhere in this file - confirm whether a plain `Hz` unit was intended.
  unit: '{Hz}'
  brief: 'Reports the current frequency of the CPU in Hz'
  attributes:
  - ref: system.cpu.logical_number
# UpDownCounter of physical cores; declared with an explicitly empty
# attribute list.
- id: metric.system.cpu.physical.count
  type: metric
  metric_name: system.cpu.physical.count
  stability: experimental
  instrument: updowncounter
  unit: '{cpu}'
  brief: >-
    Reports the number of actual physical processor cores on the hardware
  attributes: []
# UpDownCounter of logical cores; declared with an explicitly empty
# attribute list.
- id: metric.system.cpu.logical.count
  type: metric
  metric_name: system.cpu.logical.count
  stability: experimental
  instrument: updowncounter
  unit: '{cpu}'
  brief: >-
    Reports the number of logical (virtual) processor cores created by the
    operating system to manage multitasking
  attributes: []
# system.memory.* metrics and attribute group
# Attributes under the `system.memory` prefix: a single `state` enum that
# also accepts custom values.
- id: attributes.system.memory
  type: attribute_group
  prefix: system.memory
  brief: 'Describes System Memory metric attributes'
  attributes:
  - id: state
    stability: experimental
    brief: 'The memory state'
    examples:
    - free
    - cached
    type:
      allow_custom_values: true
      members:
      - id: used
        value: used
      - id: free
        value: free
      - id: shared
        value: shared
      - id: buffers
        value: buffers
      - id: cached
        value: cached
# UpDownCounter in bytes, split by `system.memory.state`; the note ties the
# sum over all states to `system.memory.limit`.
- id: metric.system.memory.usage
  type: metric
  metric_name: system.memory.usage
  stability: experimental
  instrument: updowncounter
  unit: 'By'
  brief: 'Reports memory in use by state.'
  note: |
    The sum over all `system.memory.state` values SHOULD equal the total memory
    available on the system, that is `system.memory.limit`.
  attributes:
  - ref: system.memory.state
# UpDownCounter in bytes; declared without an `attributes` key.
- id: metric.system.memory.limit
  type: metric
  metric_name: system.memory.limit
  stability: experimental
  instrument: updowncounter
  unit: 'By'
  brief: 'Total memory available in the system.'
  note: |
    Its value SHOULD equal the sum of `system.memory.state` over all states.
# Gauge with the dimensionless unit "1", split by `system.memory.state`.
- id: metric.system.memory.utilization
  type: metric
  metric_name: system.memory.utilization
  stability: experimental
  instrument: gauge
  unit: '1'
  # NOTE(review): brief is empty in the source; consider adding a description.
  brief: ''
  attributes:
  - ref: system.memory.state
# system.paging.* metrics and attribute group
# Attributes under the `system.paging` prefix. All three enums (`state`,
# `type`, `direction`) set `allow_custom_values: false`.
- id: attributes.system.paging
  type: attribute_group
  prefix: system.paging
  brief: 'Describes System Memory Paging metric attributes'
  attributes:
  - id: state
    stability: experimental
    brief: 'The memory paging state'
    examples:
    - free
    type:
      allow_custom_values: false
      members:
      - id: used
        value: used
      - id: free
        value: free
  - id: type
    stability: experimental
    brief: 'The memory paging type'
    examples:
    - minor
    type:
      allow_custom_values: false
      members:
      - id: major
        value: major
      - id: minor
        value: minor
  - id: direction
    stability: experimental
    brief: 'The paging access direction'
    examples:
    - 'in'
    type:
      allow_custom_values: false
      members:
      - id: in
        value: 'in'
      - id: out
        value: 'out'
# UpDownCounter in bytes, split by `system.paging.state`.
- id: metric.system.paging.usage
  type: metric
  metric_name: system.paging.usage
  stability: experimental
  # "Windows" is a proper noun; capitalised to match its spelling in the
  # notes of the disk and network metrics in this file.
  brief: "Unix swap or Windows pagefile usage"
  instrument: updowncounter
  unit: "By"
  attributes:
  - ref: system.paging.state
# Gauge with the dimensionless unit "1", split by `system.paging.state`.
- id: metric.system.paging.utilization
  type: metric
  metric_name: system.paging.utilization
  stability: experimental
  instrument: gauge
  unit: '1'
  # NOTE(review): brief is empty in the source; consider adding a description.
  brief: ''
  attributes:
  - ref: system.paging.state
# Counter of `{fault}`, split by `system.paging.type`.
- id: metric.system.paging.faults
  type: metric
  metric_name: system.paging.faults
  stability: experimental
  instrument: counter
  unit: '{fault}'
  # NOTE(review): brief is empty in the source; consider adding a description.
  brief: ''
  attributes:
  - ref: system.paging.type
# Counter of `{operation}`, split by `system.paging.type` and
# `system.paging.direction`.
- id: metric.system.paging.operations
  type: metric
  metric_name: system.paging.operations
  stability: experimental
  instrument: counter
  unit: '{operation}'
  # NOTE(review): brief is empty in the source; consider adding a description.
  brief: ''
  attributes:
  - ref: system.paging.type
  - ref: system.paging.direction
# system.disk.* metrics and attribute group
# Counter in bytes, split by `system.device` and `disk.io.direction`.
# `disk.io.direction` is not defined in this file.
- id: metric.system.disk.io
  type: metric
  metric_name: system.disk.io
  stability: experimental
  instrument: counter
  unit: 'By'
  # NOTE(review): brief is empty in the source; consider adding a description.
  brief: ''
  attributes:
  - ref: system.device
  - ref: disk.io.direction
# Counter of `{operation}`, split by `system.device` and `disk.io.direction`.
- id: metric.system.disk.operations
  type: metric
  metric_name: system.disk.operations
  stability: experimental
  instrument: counter
  unit: '{operation}'
  # NOTE(review): brief is empty in the source; consider adding a description.
  brief: ''
  attributes:
  - ref: system.device
  - ref: disk.io.direction
# Counter in seconds per `system.device`; the note describes how the value is
# obtained on Linux and on Windows.
- id: metric.system.disk.io_time
  type: metric
  metric_name: system.disk.io_time
  stability: experimental
  instrument: counter
  unit: 's'
  brief: 'Time disk spent activated'
  # Literal block scalar: backslashes and quotes below are kept verbatim.
  note: |
    The real elapsed time ("wall clock") used in the I/O path (time from operations running in parallel are not counted). Measured as:
    - Linux: Field 13 from [procfs-diskstats](https://www.kernel.org/doc/Documentation/ABI/testing/procfs-diskstats)
    - Windows: The complement of
      ["Disk\% Idle Time"](https://learn.microsoft.com/archive/blogs/askcore/windows-performance-monitor-disk-counters-explained#windows-performance-monitor-disk-counters-explained)
      performance counter: `uptime * (100 - "Disk\% Idle Time") / 100`
  attributes:
  - ref: system.device
# Counter in seconds, split by `system.device` and `disk.io.direction`. Per
# its note, this sums per-request time, so parallel requests all contribute.
- id: metric.system.disk.operation_time
  type: metric
  metric_name: system.disk.operation_time
  stability: experimental
  instrument: counter
  unit: 's'
  brief: 'Sum of the time each operation took to complete'
  note: |
    Because it is the sum of time each request took, parallel-issued requests each contribute to make the count grow. Measured as:
    - Linux: Fields 7 & 11 from [procfs-diskstats](https://www.kernel.org/doc/Documentation/ABI/testing/procfs-diskstats)
    - Windows: "Avg. Disk sec/Read" perf counter multiplied by "Disk Reads/sec" perf counter (similar for Writes)
  attributes:
  - ref: system.device
  - ref: disk.io.direction
# Counter of `{operation}`, split by `system.device` and `disk.io.direction`.
- id: metric.system.disk.merged
  type: metric
  metric_name: system.disk.merged
  stability: experimental
  instrument: counter
  unit: '{operation}'
  # NOTE(review): brief is empty in the source; consider adding a description.
  brief: ''
  attributes:
  - ref: system.device
  - ref: disk.io.direction
# system.filesystem.* metrics and attribute group
# Attributes under the `system.filesystem` prefix: `state` (closed enum),
# `type` (enum that also accepts custom values), and the free-form strings
# `mode` and `mountpoint`.
- id: attributes.system.filesystem
  type: attribute_group
  prefix: system.filesystem
  brief: 'Describes Filesystem metric attributes'
  attributes:
  - id: state
    stability: experimental
    brief: 'The filesystem state'
    examples:
    - used
    type:
      allow_custom_values: false
      members:
      - id: used
        value: used
      - id: free
        value: free
      - id: reserved
        value: reserved
  - id: type
    stability: experimental
    brief: 'The filesystem type'
    examples:
    - ext4
    type:
      allow_custom_values: true
      members:
      - id: fat32
        value: fat32
      - id: exfat
        value: exfat
      - id: ntfs
        value: ntfs
      - id: refs
        value: refs
      - id: hfsplus
        value: hfsplus
      - id: ext4
        value: ext4
  - id: mode
    type: string
    stability: experimental
    brief: 'The filesystem mode'
    # NOTE(review): this is one example string containing a comma, not two
    # examples - confirm that is intended.
    examples:
    - 'rw, ro'
  - id: mountpoint
    type: string
    stability: experimental
    brief: 'The filesystem mount path'
    examples:
    - '/mnt/data'
# UpDownCounter in bytes, split by `system.device` and the four
# `system.filesystem.*` attributes.
- id: metric.system.filesystem.usage
  type: metric
  metric_name: system.filesystem.usage
  stability: experimental
  instrument: updowncounter
  unit: 'By'
  # NOTE(review): brief is empty in the source; consider adding a description.
  brief: ''
  attributes:
  - ref: system.device
  - ref: system.filesystem.state
  - ref: system.filesystem.type
  - ref: system.filesystem.mode
  - ref: system.filesystem.mountpoint
# Gauge with the dimensionless unit "1"; same attribute set as
# `system.filesystem.usage`.
- id: metric.system.filesystem.utilization
  type: metric
  metric_name: system.filesystem.utilization
  stability: experimental
  instrument: gauge
  unit: '1'
  # NOTE(review): brief is empty in the source; consider adding a description.
  brief: ''
  attributes:
  - ref: system.device
  - ref: system.filesystem.state
  - ref: system.filesystem.type
  - ref: system.filesystem.mode
  - ref: system.filesystem.mountpoint
# system.network.* metrics and attribute group
# System-specific network attributes
# Attributes under the `system.network` prefix: a single closed `state` enum,
# referenced by `system.network.connections`.
- id: attributes.system.network
  type: attribute_group
  prefix: system.network
  brief: 'Describes Network metric attributes'
  attributes:
  - id: state
    stability: experimental
    brief: 'A stateless protocol MUST NOT set this attribute'
    examples:
    - close_wait
    type:
      allow_custom_values: false
      members:
      - id: close
        value: close
      - id: close_wait
        value: close_wait
      - id: closing
        value: closing
      - id: delete
        value: delete
      - id: established
        value: established
      - id: fin_wait_1
        value: fin_wait_1
      - id: fin_wait_2
        value: fin_wait_2
      - id: last_ack
        value: last_ack
      - id: listen
        value: listen
      - id: syn_recv
        value: syn_recv
      - id: syn_sent
        value: syn_sent
      - id: time_wait
        value: time_wait
# Counter of `{packet}`, split by `system.device` and `network.io.direction`.
- id: metric.system.network.dropped
  type: metric
  metric_name: system.network.dropped
  stability: experimental
  brief: "Count of packets that are dropped or discarded even though there was no error"
  instrument: counter
  unit: "{packet}"
  # The Linux per-interface statistics file is `/proc/net/dev`; the path was
  # previously written with its last two components swapped.
  note: |
    Measured as:
    - Linux: the `drop` column in `/proc/net/dev` ([source](https://web.archive.org/web/20180321091318/http://www.onlamp.com/pub/a/linux/2000/11/16/LinuxAdmin.html))
    - Windows: [`InDiscards`/`OutDiscards`](https://docs.microsoft.com/windows/win32/api/netioapi/ns-netioapi-mib_if_row2)
      from [`GetIfEntry2`](https://docs.microsoft.com/windows/win32/api/netioapi/nf-netioapi-getifentry2)
  attributes:
  - ref: system.device
  - ref: network.io.direction
# Counter of `{packet}`, split by `system.device` and `network.io.direction`.
- id: metric.system.network.packets
  type: metric
  metric_name: system.network.packets
  stability: experimental
  instrument: counter
  unit: '{packet}'
  # NOTE(review): brief is empty in the source; consider adding a description.
  brief: ''
  attributes:
  - ref: system.device
  - ref: network.io.direction
# Counter of `{error}`, split by `system.device` and `network.io.direction`.
- id: metric.system.network.errors
  type: metric
  metric_name: system.network.errors
  stability: experimental
  brief: "Count of network errors detected"
  instrument: counter
  unit: "{error}"
  # The Linux per-interface statistics file is `/proc/net/dev`; the path was
  # previously written with its last two components swapped.
  note: |
    Measured as:
    - Linux: the `errs` column in `/proc/net/dev` ([source](https://web.archive.org/web/20180321091318/http://www.onlamp.com/pub/a/linux/2000/11/16/LinuxAdmin.html)).
    - Windows: [`InErrors`/`OutErrors`](https://docs.microsoft.com/windows/win32/api/netioapi/ns-netioapi-mib_if_row2)
      from [`GetIfEntry2`](https://docs.microsoft.com/windows/win32/api/netioapi/nf-netioapi-getifentry2).
  attributes:
  - ref: system.device
  - ref: network.io.direction
# Counter in bytes, split by `system.device` and `network.io.direction`.
- id: metric.system.network.io
  type: metric
  metric_name: system.network.io
  stability: experimental
  instrument: counter
  unit: 'By'
  # NOTE(review): brief is empty in the source; consider adding a description.
  brief: ''
  attributes:
  - ref: system.device
  - ref: network.io.direction
# UpDownCounter of `{connection}`, split by `system.device`,
# `system.network.state` and `network.transport`. `network.transport` is not
# defined in this file.
- id: metric.system.network.connections
  type: metric
  metric_name: system.network.connections
  stability: experimental
  instrument: updowncounter
  unit: '{connection}'
  # NOTE(review): brief is empty in the source; consider adding a description.
  brief: ''
  attributes:
  - ref: system.device
  - ref: system.network.state
  - ref: network.transport
# system.process.* metrics and attribute group
# Attributes under the `system.process` prefix: a single `status` enum that
# also accepts custom values.
- id: attributes.system.process
  type: attribute_group
  prefix: system.process
  brief: 'Describes System Process metric attributes'
  attributes:
  - id: status
    stability: experimental
    # Folded scalar with default chomping, so the value ends with a newline.
    brief: >
      The process state, e.g., [Linux Process State Codes](https://man7.org/linux/man-pages/man1/ps.1.html#PROCESS_STATE_CODES)
    examples:
    - running
    type:
      allow_custom_values: true
      members:
      - id: running
        value: running
      - id: sleeping
        value: sleeping
      - id: stopped
        value: stopped
      - id: defunct
        value: defunct
# UpDownCounter of `{process}`, split by `system.process.status`.
- id: metric.system.process.count
  type: metric
  metric_name: system.process.count
  stability: experimental
  instrument: updowncounter
  unit: '{process}'
  brief: 'Total number of processes in each state'
  attributes:
  - ref: system.process.status
# Counter of `{process}`; declared without an `attributes` key.
- id: metric.system.process.created
  type: metric
  metric_name: system.process.created
  stability: experimental
  instrument: counter
  unit: '{process}'
  brief: 'Total number of processes created over uptime of the host'
# system.linux.* metrics
# UpDownCounter in bytes; declared without an `attributes` key. The note
# positions it as an alternative to `system.memory.usage` with `state=free`.
- id: metric.system.linux.memory.available
  type: metric
  metric_name: system.linux.memory.available
  stability: experimental
  instrument: updowncounter
  unit: 'By'
  brief: >-
    An estimate of how much memory is available for starting new applications,
    without causing swapping
  note: |
    This is an alternative to `system.memory.usage` metric with `state=free`.
    Linux starting from 3.14 exports "available" memory. It takes "free" memory as a baseline, and then factors in kernel-specific values.
    This is supposed to be more accurate than just "free" memory.
    For reference, see the calculations [here](https://superuser.com/a/980821).
    See also `MemAvailable` in [/proc/meminfo](https://man7.org/linux/man-pages/man5/proc.5.html).