From 89a4a32d5607ca9d9aec7b31f35a747e68060ee4 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 16 May 2024 15:55:14 +0200 Subject: [PATCH 1/2] Fix missing GPM metrics Signed-off-by: Evan Lezar --- pkg/nvml/gpm.go | 19 ++++++++--- pkg/nvml/gpm_test.go | 79 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 4 deletions(-) create mode 100644 pkg/nvml/gpm_test.go diff --git a/pkg/nvml/gpm.go b/pkg/nvml/gpm.go index acdb2e0..2d60d07 100644 --- a/pkg/nvml/gpm.go +++ b/pkg/nvml/gpm.go @@ -51,20 +51,31 @@ func (g *nvmlGpmMetricsGetType) convert() *GpmMetricsGetType { // nvml.GpmMetricsGet() type GpmMetricsGetVType struct { - metricsGet *nvmlGpmMetricsGetType + metricsGet *GpmMetricsGetType } func (l *library) GpmMetricsGetV(metricsGet *GpmMetricsGetType) GpmMetricsGetVType { - return GpmMetricsGetVType{metricsGet.convert()} + return GpmMetricsGetVType{metricsGet} } + +// nvmlGpmMetricsGetStub is a stub function that can be overridden for testing. +var nvmlGpmMetricsGetStub = nvmlGpmMetricsGet + func (metricsGetV GpmMetricsGetVType) V1() Return { metricsGetV.metricsGet.Version = 1 - return nvmlGpmMetricsGet(metricsGetV.metricsGet) + return gpmMetricsGet(metricsGetV.metricsGet) } func (l *library) GpmMetricsGet(metricsGet *GpmMetricsGetType) Return { metricsGet.Version = GPM_METRICS_GET_VERSION - return nvmlGpmMetricsGet(metricsGet.convert()) + return gpmMetricsGet(metricsGet) +} + +func gpmMetricsGet(metricsGet *GpmMetricsGetType) Return { + nvmlMetricsGet := metricsGet.convert() + ret := nvmlGpmMetricsGetStub(nvmlMetricsGet) + *metricsGet = *nvmlMetricsGet.convert() + return ret } // nvml.GpmSampleFree() diff --git a/pkg/nvml/gpm_test.go b/pkg/nvml/gpm_test.go new file mode 100644 index 0000000..24368d7 --- /dev/null +++ b/pkg/nvml/gpm_test.go @@ -0,0 +1,79 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package nvml + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestGpmMetricsGet(t *testing.T) { + overrideMetrics := [98]GpmMetric{ + { + Value: 99, + }, + } + defer setNvmlGpmMetricsGetStubForTest(func(metricsGet *nvmlGpmMetricsGetType) Return { + metricsGet.Metrics = overrideMetrics + return SUCCESS + })() + + metrics := GpmMetricsGetType{ + Sample1: nvmlGpmSample{}, + Sample2: nvmlGpmSample{}, + } + ret := GpmMetricsGet(&metrics) + + require.Equal(t, SUCCESS, ret) + require.EqualValues(t, GPM_METRICS_GET_VERSION, metrics.Version) + + require.EqualValues(t, overrideMetrics, metrics.Metrics) +} + +func TestGpmMetricsGetV(t *testing.T) { + overrideMetrics := [98]GpmMetric{ + { + Value: 99, + }, + } + defer setNvmlGpmMetricsGetStubForTest(func(metricsGet *nvmlGpmMetricsGetType) Return { + metricsGet.Metrics = overrideMetrics + return SUCCESS + })() + + metrics := GpmMetricsGetType{ + Sample1: nvmlGpmSample{}, + Sample2: nvmlGpmSample{}, + } + + ret := GpmMetricsGetV(&metrics).V1() + + require.Equal(t, SUCCESS, ret) + require.EqualValues(t, GPM_METRICS_GET_VERSION, metrics.Version) + + require.EqualValues(t, overrideMetrics, metrics.Metrics) +} + +func setNvmlGpmMetricsGetStubForTest(mock func(metricsGet *nvmlGpmMetricsGetType) Return) func() { + original := nvmlGpmMetricsGetStub + + nvmlGpmMetricsGetStub = mock + return func() { + nvmlGpmMetricsGetStub = original + } +} From 72a248ccda355b132e63a3ed8b589d307216eb01 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 16 May 2024 14:02:12 +0200 Subject: [PATCH 2/2] Add basic gpm metrics example Signed-off-by: Evan Lezar --- examples/gpm-metrics/main.go | 111 +++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 examples/gpm-metrics/main.go diff --git a/examples/gpm-metrics/main.go b/examples/gpm-metrics/main.go new file mode 100644 index 0000000..c61638a --- /dev/null +++ b/examples/gpm-metrics/main.go @@ -0,0 +1,111 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package main + +import ( + "fmt" + "log" + "time" + + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +func main() { + ret := nvml.Init() + if ret != nvml.SUCCESS { + log.Fatalf("failed to init NVML: %v", ret) + } + defer func() { + _ = nvml.Shutdown() + }() + + count, ret := nvml.DeviceGetCount() + if ret != nvml.SUCCESS { + log.Fatalf("failed to get device count: %v", ret) + } + + for i := 0; i < count; i++ { + if err := collectGPMMetrics(i); err != nil { + log.Printf("failed to get metrics for device %d: %v\n", i, err) + } + } +} + +// collectGPMMetrics gets GPM metrics for a specified device. +func collectGPMMetrics(i int) error { + device, ret := nvml.DeviceGetHandleByIndex(i) + if ret != nvml.SUCCESS { + return fmt.Errorf("could not get devices handle: %w", ret) + } + + gpuQuerySupport, ret := device.GpmQueryDeviceSupport() + if ret != nvml.SUCCESS { + return fmt.Errorf("could not query GPM support: %w", ret) + } + + if gpuQuerySupport.IsSupportedDevice == 0 { + return fmt.Errorf("GPM queries are not supported") + } + + sample1, ret := nvml.GpmSampleAlloc() + if ret != nvml.SUCCESS { + return fmt.Errorf("could not allocate sample: %w", ret) + } + defer func() { + _ = sample1.Free() + }() + sample2, ret := nvml.GpmSampleAlloc() + if ret != nvml.SUCCESS { + return fmt.Errorf("could not allocate sample: %w", ret) + } + defer func() { + _ = sample2.Free() + }() + + if ret := device.GpmSampleGet(sample1); ret != nvml.SUCCESS { + return fmt.Errorf("could not get sample: %w", ret) + } + // add a delay between samples. + time.Sleep(1 * time.Second) + if ret := device.GpmSampleGet(sample2); ret != nvml.SUCCESS { + return fmt.Errorf("could not get sample: %w", ret) + } + + gpmMetric := nvml.GpmMetricsGetType{ + NumMetrics: 1, + Sample1: sample1, + Sample2: sample2, + Metrics: [98]nvml.GpmMetric{ + { + MetricId: uint32(nvml.GPM_METRIC_GRAPHICS_UTIL), + }, + }, + } + + ret = nvml.GpmMetricsGet(&gpmMetric) + if ret != nvml.SUCCESS { + return fmt.Errorf("failed to get gpm metric: %w", ret) + } + + for i := 0; i < int(gpmMetric.NumMetrics); i++ { + if gpmMetric.Metrics[i].MetricId > 0 { + fmt.Printf("%v: %v\n", gpmMetric.Metrics[i].MetricId, gpmMetric.Metrics[i].Value) + } + } + + return nil +}