diff --git a/sdk/monitor/monitor-opentelemetry/CHANGELOG.md b/sdk/monitor/monitor-opentelemetry/CHANGELOG.md index 670deca74661..7a39e323c9e2 100644 --- a/sdk/monitor/monitor-opentelemetry/CHANGELOG.md +++ b/sdk/monitor/monitor-opentelemetry/CHANGELOG.md @@ -3,6 +3,7 @@ ## 1.7.2 (Unreleased) ### Features Added +- Changed live metrics CPU/Memory perf counter metrics to emit normalized process CPU and process physical memory bytes. - Support for Live Metrics Filtering ### Breaking Changes diff --git a/sdk/monitor/monitor-opentelemetry/src/metrics/quickpulse/liveMetrics.ts b/sdk/monitor/monitor-opentelemetry/src/metrics/quickpulse/liveMetrics.ts index 310858a482df..178db2774ea0 100644 --- a/sdk/monitor/monitor-opentelemetry/src/metrics/quickpulse/liveMetrics.ts +++ b/sdk/monitor/monitor-opentelemetry/src/metrics/quickpulse/liveMetrics.ts @@ -77,6 +77,7 @@ import { MetricFailureToCreateError, } from "./filtering/quickpulseErrors"; import { SEMATTRS_EXCEPTION_TYPE } from "@opentelemetry/semantic-conventions"; +import { getPhysicalMemory, getProcessorTimeNormalized } from "../utils"; const POST_INTERVAL = 1000; const MAX_POST_WAIT_TIME = 20000; @@ -99,8 +100,8 @@ export class LiveMetrics { private requestFailedRateGauge: ObservableGauge | undefined; private dependencyRateGauge: ObservableGauge | undefined; private dependencyFailedRateGauge: ObservableGauge | undefined; - private memoryCommitedGauge: ObservableGauge | undefined; - private processorTimeGauge: ObservableGauge | undefined; + private processPhysicalBytesGauge: ObservableGauge | undefined; + private percentProcessorTimeNormalizedGauge: ObservableGauge | undefined; private exceptionsRateGauge: ObservableGauge | undefined; private documents: DocumentIngress[] = []; @@ -135,13 +136,8 @@ export class LiveMetrics { private lastDependencyRate: { count: number; time: number } = { count: 0, time: 0 }; private lastFailedDependencyRate: { count: number; time: number } = { count: 0, time: 0 }; private lastExceptionRate: { count: number; time: number } = { count: 0, time: 0 }; - private lastCpus: - | { - model: string; - speed: number; - times: { user: number; nice: number; sys: number; idle: number; irq: number }; - }[] - | undefined; + private lastCpuUsage: NodeJS.CpuUsage; + private lastHrTime: bigint; private statsbeatOptionsUpdated = false; private etag: string = ""; private errorTracker: CollectionConfigurationErrorTracker = @@ -209,6 +205,8 @@ export class LiveMetrics { // eslint-disable-next-line @typescript-eslint/no-misused-promises this.handle = setTimeout(this.goQuickpulse.bind(this), this.pingInterval); this.handle.unref(); // Don't block apps from terminating + this.lastCpuUsage = process.cpuUsage(); + this.lastHrTime = process.hrtime.bigint(); } public shutdown(): void { @@ -299,7 +297,6 @@ export class LiveMetrics { getInstance().setStatsbeatFeatures({}, { liveMetrics: true }); this.statsbeatOptionsUpdated = true; } - this.lastCpus = os.cpus(); this.totalDependencyCount = 0; this.totalExceptionCount = 0; this.totalFailedDependencyCount = 0; @@ -363,15 +360,15 @@ export class LiveMetrics { }, ); - this.memoryCommitedGauge = this.meter.createObservableGauge( - QuickPulseOpenTelemetryMetricNames.COMMITTED_BYTES, + this.processPhysicalBytesGauge = this.meter.createObservableGauge( + QuickPulseOpenTelemetryMetricNames.PHYSICAL_BYTES, { valueType: ValueType.INT, }, ); - this.processorTimeGauge = this.meter.createObservableGauge( - QuickPulseOpenTelemetryMetricNames.PROCESSOR_TIME, + this.percentProcessorTimeNormalizedGauge = this.meter.createObservableGauge( + QuickPulseOpenTelemetryMetricNames.PROCESSOR_TIME_NORMALIZED, { valueType: ValueType.DOUBLE, }, @@ -390,8 +387,10 @@ export class LiveMetrics { this.dependencyRateGauge.addCallback(this.getDependencyRate.bind(this)); this.dependencyFailedRateGauge.addCallback(this.getDependencyFailedRate.bind(this)); this.exceptionsRateGauge.addCallback(this.getExceptionRate.bind(this)); - this.memoryCommitedGauge.addCallback(this.getCommitedMemory.bind(this)); - this.processorTimeGauge.addCallback(this.getProcessorTime.bind(this)); + this.processPhysicalBytesGauge.addCallback(this.getPhysicalMemory.bind(this)); + this.percentProcessorTimeNormalizedGauge.addCallback( + this.getProcessorTimeNormalized.bind(this), + ); } /** @@ -652,69 +651,16 @@ export class LiveMetrics { }; } - private getCommitedMemory(observableResult: ObservableResult): void { - const freeMem = os.freemem(); - const committedMemory = os.totalmem() - freeMem; - observableResult.observe(committedMemory); - } - - private getTotalCombinedCpu( - cpus: os.CpuInfo[], - lastCpus: os.CpuInfo[], - ): { combinedTotal: number; totalUser: number; totalIdle: number } { - let totalUser = 0; - let totalSys = 0; - let totalNice = 0; - let totalIdle = 0; - let totalIrq = 0; - for (let i = 0; !!cpus && i < cpus.length; i++) { - const cpu = cpus[i]; - const lastCpu = lastCpus[i]; - const times = cpu.times; - const lastTimes = lastCpu.times; - // user cpu time (or) % CPU time spent in user space - let user = times.user - lastTimes.user; - user = user > 0 ? user : 0; // Avoid negative values - totalUser += user; - // system cpu time (or) % CPU time spent in kernel space - let sys = times.sys - lastTimes.sys; - sys = sys > 0 ? sys : 0; // Avoid negative values - totalSys += sys; - // user nice cpu time (or) % CPU time spent on low priority processes - let nice = times.nice - lastTimes.nice; - nice = nice > 0 ? nice : 0; // Avoid negative values - totalNice += nice; - // idle cpu time (or) % CPU time spent idle - let idle = times.idle - lastTimes.idle; - idle = idle > 0 ? idle : 0; // Avoid negative values - totalIdle += idle; - // irq (or) % CPU time spent servicing/handling hardware interrupts - let irq = times.irq - lastTimes.irq; - irq = irq > 0 ? irq : 0; // Avoid negative values - totalIrq += irq; - } - const combinedTotal = totalUser + totalSys + totalNice + totalIdle + totalIrq; - return { - combinedTotal: combinedTotal, - totalUser: totalUser, - totalIdle: totalIdle, - }; + private getPhysicalMemory(observableResult: ObservableResult): void { + const rss = getPhysicalMemory(); + observableResult.observe(rss); } - private getProcessorTime(observableResult: ObservableResult): void { - // this reports total ms spent in each category since the OS was booted, to calculate percent it is necessary - // to find the delta since the last measurement - const cpus = os.cpus(); - if (cpus && cpus.length && this.lastCpus && cpus.length === this.lastCpus.length) { - const cpuTotals = this.getTotalCombinedCpu(cpus, this.lastCpus); - - const value = - cpuTotals.combinedTotal > 0 - ? ((cpuTotals.combinedTotal - cpuTotals.totalIdle) / cpuTotals.combinedTotal) * 100 - : 0; - observableResult.observe(value); - } - this.lastCpus = cpus; + private getProcessorTimeNormalized(observableResult: ObservableResult): void { + const cpuUsagePercent = getProcessorTimeNormalized(this.lastHrTime, this.lastCpuUsage); + observableResult.observe(cpuUsagePercent); + this.lastHrTime = process.hrtime.bigint(); + this.lastCpuUsage = process.cpuUsage(); } private updateConfiguration(response: PublishResponse | IsSubscribedResponse): void { diff --git a/sdk/monitor/monitor-opentelemetry/src/metrics/quickpulse/types.ts b/sdk/monitor/monitor-opentelemetry/src/metrics/quickpulse/types.ts index 3436613a0e7c..f26ae3366197 100644 --- a/sdk/monitor/monitor-opentelemetry/src/metrics/quickpulse/types.ts +++ b/sdk/monitor/monitor-opentelemetry/src/metrics/quickpulse/types.ts @@ -31,8 +31,8 @@ export interface QuickpulseExporterOptions { } export enum QuickPulseOpenTelemetryMetricNames { - COMMITTED_BYTES = "azureMonitor.memoryCommittedBytes", - PROCESSOR_TIME = "azureMonitor.processorTotalProcessorTime", + PHYSICAL_BYTES = "azureMonitor.physicalBytes", + PROCESSOR_TIME_NORMALIZED = "azureMonitor.percentProcessorTimeNormalized", REQUEST_RATE = "azureMonitor.requestsSec", REQUEST_FAILURE_RATE = "azureMonitor.requestsFailedSec", REQUEST_DURATION = "azureMonitor.requestDuration", @@ -43,10 +43,14 @@ export enum QuickPulseOpenTelemetryMetricNames { } export enum QuickPulseMetricNames { - // Memory + // Memory (old) COMMITTED_BYTES = "\\Memory\\Committed Bytes", - // CPU + // Memory (new - current process) + PHYSICAL_BYTES = "\\Process\\Physical Bytes", + // CPU (old) PROCESSOR_TIME = "\\Processor(_Total)\\% Processor Time", + // CPU (new - current process) + PROCESSOR_TIME_NORMALIZED = "\\% Process\\Processor Time Normalized", // Request REQUEST_RATE = "\\ApplicationInsights\\Requests/Sec", REQUEST_FAILURE_RATE = "\\ApplicationInsights\\Requests Failed/Sec", diff --git a/sdk/monitor/monitor-opentelemetry/src/metrics/quickpulse/utils.ts b/sdk/monitor/monitor-opentelemetry/src/metrics/quickpulse/utils.ts index a9d1200525e6..33ffe0ea3b02 100644 --- a/sdk/monitor/monitor-opentelemetry/src/metrics/quickpulse/utils.ts +++ b/sdk/monitor/monitor-opentelemetry/src/metrics/quickpulse/utils.ts @@ -185,8 +185,8 @@ export function resourceMetricsToQuickpulseDataPoint( // Update name to expected value in Quickpulse, needed because those names are invalid in OTel switch (metric.descriptor.name) { - case QuickPulseOpenTelemetryMetricNames.COMMITTED_BYTES: - metricPoint.name = QuickPulseMetricNames.COMMITTED_BYTES; + case QuickPulseOpenTelemetryMetricNames.PHYSICAL_BYTES: + metricPoint.name = QuickPulseMetricNames.PHYSICAL_BYTES; break; case QuickPulseOpenTelemetryMetricNames.DEPENDENCY_DURATION: metricPoint.name = QuickPulseMetricNames.DEPENDENCY_DURATION; @@ -200,8 +200,8 @@ export function resourceMetricsToQuickpulseDataPoint( case QuickPulseOpenTelemetryMetricNames.EXCEPTION_RATE: metricPoint.name = QuickPulseMetricNames.EXCEPTION_RATE; break; - case QuickPulseOpenTelemetryMetricNames.PROCESSOR_TIME: - metricPoint.name = QuickPulseMetricNames.PROCESSOR_TIME; + case QuickPulseOpenTelemetryMetricNames.PROCESSOR_TIME_NORMALIZED: + metricPoint.name = QuickPulseMetricNames.PROCESSOR_TIME_NORMALIZED; break; case QuickPulseOpenTelemetryMetricNames.REQUEST_DURATION: metricPoint.name = QuickPulseMetricNames.REQUEST_DURATION; @@ -225,6 +225,23 @@ export function resourceMetricsToQuickpulseDataPoint( metricPoint.value = (dataPoint.value as Histogram).sum || 0; } metricPoints.push(metricPoint); + + // TODO: remove the metric points with the old metric names after + // UI side has done their changes to support the new names. + if ( + metricPoint.name === QuickPulseMetricNames.PHYSICAL_BYTES || + metricPoint.name === QuickPulseMetricNames.PROCESSOR_TIME_NORMALIZED + ) { + const oldMetricPoint: MetricPoint = { + weight: 1, + name: + metricPoint.name === QuickPulseMetricNames.PHYSICAL_BYTES + ? QuickPulseMetricNames.COMMITTED_BYTES + : QuickPulseMetricNames.PROCESSOR_TIME, + value: dataPoint.value as number, + }; + metricPoints.push(oldMetricPoint); + } }); }); }); diff --git a/sdk/monitor/monitor-opentelemetry/src/metrics/utils.ts b/sdk/monitor/monitor-opentelemetry/src/metrics/utils.ts index a3736f04a376..1f56b21ac0bb 100644 --- a/sdk/monitor/monitor-opentelemetry/src/metrics/utils.ts +++ b/sdk/monitor/monitor-opentelemetry/src/metrics/utils.ts @@ -34,6 +34,7 @@ import { } from "./types"; import { LogRecord } from "@opentelemetry/sdk-logs"; import { Resource } from "@opentelemetry/resources"; +import * as os from "os"; export function getRequestDimensions(span: ReadableSpan): Attributes { const dimensions: MetricRequestDimensions = getBaseDimensions(span.resource); @@ -171,3 +172,27 @@ export function convertDimensions( } return convertedDimensions as Attributes; } + +// to get physical memory bytes +export function getPhysicalMemory(): number { + return process.memoryUsage.rss(); +} + +// This function can get the normalized cpu, but it assumes that after this function is called, +// that the process.hrtime.bigint() & process.cpuUsage() are called/stored to be used as the +// parameters for the next call. +export function getProcessorTimeNormalized( + lastHrTime: bigint, + lastCpuUsage: NodeJS.CpuUsage, +): number { + let numCpus = os.cpus().length; + const usageDif = process.cpuUsage(lastCpuUsage); + const elapsedTimeNs = process.hrtime.bigint() - lastHrTime; + + const usageDifMs = (usageDif.user + usageDif.system) / 1000.0; + const elapsedTimeMs = elapsedTimeNs === BigInt(0) ? 1 : Number(elapsedTimeNs) / 1000000.0; + // just for division safety, don't know a case in which this would actually happen + numCpus = numCpus === 0 ? 1 : numCpus; + + return (usageDifMs / elapsedTimeMs / numCpus) * 100; +} diff --git a/sdk/monitor/monitor-opentelemetry/test/internal/unit/metrics/liveMetrics.test.ts b/sdk/monitor/monitor-opentelemetry/test/internal/unit/metrics/liveMetrics.test.ts index 26a2d51a31e8..a1f0c7c33a78 100644 --- a/sdk/monitor/monitor-opentelemetry/test/internal/unit/metrics/liveMetrics.test.ts +++ b/sdk/monitor/monitor-opentelemetry/test/internal/unit/metrics/liveMetrics.test.ts @@ -7,10 +7,14 @@ import { ExportResultCode, millisToHrTime } from "@opentelemetry/core"; import { LoggerProvider, LogRecord } from "@opentelemetry/sdk-logs"; import { LiveMetrics } from "../../../../src/metrics/quickpulse/liveMetrics"; import { InternalConfig } from "../../../../src/shared"; -import { QuickPulseOpenTelemetryMetricNames } from "../../../../src/metrics/quickpulse/types"; +import { + QuickPulseMetricNames, + QuickPulseOpenTelemetryMetricNames, +} from "../../../../src/metrics/quickpulse/types"; /* eslint-disable-next-line @typescript-eslint/no-redeclare */ import { Exception, RemoteDependency, Request } from "../../../../src/generated"; import { AccessToken, TokenCredential } from "@azure/core-auth"; +import { resourceMetricsToQuickpulseDataPoint } from "../../../../src/metrics/quickpulse/utils"; describe("#LiveMetrics", () => { let exportStub: sinon.SinonStub; @@ -169,16 +173,16 @@ describe("#LiveMetrics", () => { assert.ok(metrics[5].dataPoints[0].value > 0, "DEPENDENCY_FAILURE_RATE value"); assert.strictEqual( metrics[6].descriptor.name, - QuickPulseOpenTelemetryMetricNames.COMMITTED_BYTES, + QuickPulseOpenTelemetryMetricNames.PHYSICAL_BYTES, ); assert.strictEqual(metrics[6].dataPoints.length, 1, "dataPoints count"); - assert.ok(metrics[6].dataPoints[0].value > 0, "COMMITTED_BYTES dataPoint value"); + assert.ok(metrics[6].dataPoints[0].value > 0, "PHYSICAL_BYTES dataPoint value"); assert.strictEqual( metrics[7].descriptor.name, - QuickPulseOpenTelemetryMetricNames.PROCESSOR_TIME, + QuickPulseOpenTelemetryMetricNames.PROCESSOR_TIME_NORMALIZED, ); assert.strictEqual(metrics[7].dataPoints.length, 1, "dataPoints count"); - assert.ok(metrics[7].dataPoints[0].value >= 0, "PROCESSOR_TIME dataPoint value"); + assert.ok(metrics[7].dataPoints[0].value >= 0, "PROCESSOR_TIME_NORMALIZED dataPoint value"); assert.strictEqual( metrics[8].descriptor.name, QuickPulseOpenTelemetryMetricNames.EXCEPTION_RATE, @@ -226,6 +230,38 @@ describe("#LiveMetrics", () => { assert.equal((documents[i].properties as any)[0].key, "customAttribute"); assert.equal((documents[i].properties as any)[0].value, "test"); } + + // testing that the old/new names for the perf counters appear in the monitoring data point, + // with the values of the process counters + const monitoringDataPoints = resourceMetricsToQuickpulseDataPoint( + resourceMetrics, + autoCollect["quickpulseExporter"]["baseMonitoringDataPoint"], + documents, + [], + new Map(), + ); + assert.ok(monitoringDataPoints[0].metrics?.length === 11); + assert.ok( + monitoringDataPoints[0].metrics[6].name === QuickPulseMetricNames.PHYSICAL_BYTES.toString(), + ); + assert.ok(monitoringDataPoints[0].metrics[6].value > 0); + assert.ok( + monitoringDataPoints[0].metrics[7].name === QuickPulseMetricNames.COMMITTED_BYTES.toString(), + ); + assert.ok( + monitoringDataPoints[0].metrics[7].value === monitoringDataPoints[0].metrics[6].value, + ); + assert.ok( + monitoringDataPoints[0].metrics[8].name === + QuickPulseMetricNames.PROCESSOR_TIME_NORMALIZED.toString(), + ); + assert.ok(monitoringDataPoints[0].metrics[8].value >= 0); + assert.ok( + monitoringDataPoints[0].metrics[9].name === QuickPulseMetricNames.PROCESSOR_TIME.toString(), + ); + assert.ok( + monitoringDataPoints[0].metrics[9].value === monitoringDataPoints[0].metrics[8].value, + ); }); it("should retrieve meter provider", () => {