From b72647c35b6968112f18978dbbe3292a8191ae88 Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Tue, 23 Apr 2024 10:42:45 +0000 Subject: [PATCH 1/7] Update mock dgxa100 server with real values for GI and CI profiles Signed-off-by: Kevin Klues --- pkg/nvml/mock/dgxa100/dgxa100.go | 120 +++++++++++++++---------------- 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/pkg/nvml/mock/dgxa100/dgxa100.go b/pkg/nvml/mock/dgxa100/dgxa100.go index 0188a03..90e7efb 100644 --- a/pkg/nvml/mock/dgxa100/dgxa100.go +++ b/pkg/nvml/mock/dgxa100/dgxa100.go @@ -64,91 +64,91 @@ var MIGProfiles = struct { IsP2pSupported: 0, SliceCount: 1, InstanceCount: 7, - MultiprocessorCount: 1, + MultiprocessorCount: 14, CopyEngineCount: 1, DecoderCount: 0, EncoderCount: 0, JpegCount: 0, OfaCount: 0, - MemorySizeMB: 5120, + MemorySizeMB: 4864, }, nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, IsP2pSupported: 0, SliceCount: 1, InstanceCount: 1, - MultiprocessorCount: 1, + MultiprocessorCount: 14, CopyEngineCount: 1, DecoderCount: 1, - EncoderCount: 1, + EncoderCount: 0, JpegCount: 1, OfaCount: 1, - MemorySizeMB: 5120, + MemorySizeMB: 4864, }, nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, IsP2pSupported: 0, SliceCount: 1, InstanceCount: 4, - MultiprocessorCount: 1, + MultiprocessorCount: 14, CopyEngineCount: 1, - DecoderCount: 0, + DecoderCount: 1, EncoderCount: 0, JpegCount: 0, OfaCount: 0, - MemorySizeMB: 10240, + MemorySizeMB: 9856, }, nvml.GPU_INSTANCE_PROFILE_2_SLICE: { Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, IsP2pSupported: 0, SliceCount: 2, InstanceCount: 3, - MultiprocessorCount: 2, + MultiprocessorCount: 28, CopyEngineCount: 2, DecoderCount: 1, - EncoderCount: 1, + EncoderCount: 0, JpegCount: 0, OfaCount: 0, - MemorySizeMB: 10240, + MemorySizeMB: 9856, }, nvml.GPU_INSTANCE_PROFILE_3_SLICE: { Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, IsP2pSupported: 0, SliceCount: 3, InstanceCount: 2, - MultiprocessorCount: 3, - CopyEngineCount: 4, + MultiprocessorCount: 42, + CopyEngineCount: 3, DecoderCount: 2, - EncoderCount: 2, + EncoderCount: 0, JpegCount: 0, OfaCount: 0, - MemorySizeMB: 20480, + MemorySizeMB: 19968, }, nvml.GPU_INSTANCE_PROFILE_4_SLICE: { Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, IsP2pSupported: 0, SliceCount: 4, InstanceCount: 1, - MultiprocessorCount: 4, + MultiprocessorCount: 56, CopyEngineCount: 4, DecoderCount: 2, - EncoderCount: 2, + EncoderCount: 0, JpegCount: 0, OfaCount: 0, - MemorySizeMB: 20480, + MemorySizeMB: 19968, }, nvml.GPU_INSTANCE_PROFILE_7_SLICE: { Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, IsP2pSupported: 0, SliceCount: 7, InstanceCount: 1, - MultiprocessorCount: 7, - CopyEngineCount: 8, + MultiprocessorCount: 98, + CopyEngineCount: 7, DecoderCount: 5, - EncoderCount: 5, + EncoderCount: 0, JpegCount: 1, OfaCount: 1, - MemorySizeMB: 40960, + MemorySizeMB: 40192, }, }, ComputeInstanceProfiles: map[int]map[int]nvml.ComputeInstanceProfileInfo{ @@ -157,7 +157,7 @@ var MIGProfiles = struct { Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 1, - MultiprocessorCount: 1, + MultiprocessorCount: 14, SharedCopyEngineCount: 1, SharedDecoderCount: 0, SharedEncoderCount: 0, @@ -170,10 +170,10 @@ var MIGProfiles = struct { Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 1, - MultiprocessorCount: 1, + MultiprocessorCount: 14, SharedCopyEngineCount: 1, SharedDecoderCount: 1, - SharedEncoderCount: 1, + SharedEncoderCount: 0, SharedJpegCount: 1, SharedOfaCount: 1, }, @@ -183,9 +183,9 
@@ var MIGProfiles = struct { Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 1, - MultiprocessorCount: 1, + MultiprocessorCount: 14, SharedCopyEngineCount: 1, - SharedDecoderCount: 0, + SharedDecoderCount: 1, SharedEncoderCount: 0, SharedJpegCount: 0, SharedOfaCount: 0, @@ -196,10 +196,10 @@ var MIGProfiles = struct { Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 2, - MultiprocessorCount: 1, + MultiprocessorCount: 14, SharedCopyEngineCount: 2, SharedDecoderCount: 1, - SharedEncoderCount: 1, + SharedEncoderCount: 0, SharedJpegCount: 0, SharedOfaCount: 0, }, @@ -207,10 +207,10 @@ var MIGProfiles = struct { Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, SliceCount: 2, InstanceCount: 1, - MultiprocessorCount: 2, + MultiprocessorCount: 28, SharedCopyEngineCount: 2, SharedDecoderCount: 1, - SharedEncoderCount: 1, + SharedEncoderCount: 0, SharedJpegCount: 0, SharedOfaCount: 0, }, @@ -220,10 +220,10 @@ var MIGProfiles = struct { Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 3, - MultiprocessorCount: 1, - SharedCopyEngineCount: 4, + MultiprocessorCount: 14, + SharedCopyEngineCount: 3, SharedDecoderCount: 2, - SharedEncoderCount: 1, + SharedEncoderCount: 0, SharedJpegCount: 0, SharedOfaCount: 0, }, @@ -231,10 +231,10 @@ var MIGProfiles = struct { Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, SliceCount: 2, InstanceCount: 1, - MultiprocessorCount: 2, - SharedCopyEngineCount: 4, + MultiprocessorCount: 28, + SharedCopyEngineCount: 3, SharedDecoderCount: 2, - SharedEncoderCount: 2, + SharedEncoderCount: 0, SharedJpegCount: 0, SharedOfaCount: 0, }, @@ -242,8 +242,8 @@ var MIGProfiles = struct { Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, SliceCount: 3, InstanceCount: 1, - MultiprocessorCount: 3, - SharedCopyEngineCount: 4, + MultiprocessorCount: 42, + SharedCopyEngineCount: 3, SharedDecoderCount: 2, SharedEncoderCount: 0, SharedJpegCount: 0, @@ -255,10 +255,10 @@ var MIGProfiles = struct { Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 4, - MultiprocessorCount: 1, + MultiprocessorCount: 14, SharedCopyEngineCount: 4, SharedDecoderCount: 2, - SharedEncoderCount: 2, + SharedEncoderCount: 0, SharedJpegCount: 0, SharedOfaCount: 0, }, @@ -266,10 +266,10 @@ var MIGProfiles = struct { Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, SliceCount: 2, InstanceCount: 2, - MultiprocessorCount: 2, + MultiprocessorCount: 28, SharedCopyEngineCount: 4, SharedDecoderCount: 2, - SharedEncoderCount: 2, + SharedEncoderCount: 0, SharedJpegCount: 0, SharedOfaCount: 0, }, @@ -277,10 +277,10 @@ var MIGProfiles = struct { Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, SliceCount: 4, InstanceCount: 1, - MultiprocessorCount: 4, + MultiprocessorCount: 56, SharedCopyEngineCount: 4, SharedDecoderCount: 2, - SharedEncoderCount: 2, + SharedEncoderCount: 0, SharedJpegCount: 0, SharedOfaCount: 0, }, @@ -290,10 +290,10 @@ var MIGProfiles = struct { Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 7, - MultiprocessorCount: 1, - SharedCopyEngineCount: 8, + MultiprocessorCount: 14, + SharedCopyEngineCount: 7, SharedDecoderCount: 5, - SharedEncoderCount: 5, + SharedEncoderCount: 0, SharedJpegCount: 1, SharedOfaCount: 1, }, @@ -301,10 +301,10 @@ var MIGProfiles = struct { Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, SliceCount: 2, InstanceCount: 3, - MultiprocessorCount: 2, - SharedCopyEngineCount: 8, + MultiprocessorCount: 28, + SharedCopyEngineCount: 7, SharedDecoderCount: 5, - SharedEncoderCount: 5, + SharedEncoderCount: 0, 
SharedJpegCount: 1, SharedOfaCount: 1, }, @@ -312,10 +312,10 @@ var MIGProfiles = struct { Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, SliceCount: 3, InstanceCount: 2, - MultiprocessorCount: 3, - SharedCopyEngineCount: 8, + MultiprocessorCount: 42, + SharedCopyEngineCount: 7, SharedDecoderCount: 5, - SharedEncoderCount: 5, + SharedEncoderCount: 0, SharedJpegCount: 1, SharedOfaCount: 1, }, @@ -323,10 +323,10 @@ var MIGProfiles = struct { Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, SliceCount: 4, InstanceCount: 1, - MultiprocessorCount: 4, - SharedCopyEngineCount: 8, + MultiprocessorCount: 56, + SharedCopyEngineCount: 7, SharedDecoderCount: 5, - SharedEncoderCount: 5, + SharedEncoderCount: 0, SharedJpegCount: 1, SharedOfaCount: 1, }, @@ -334,10 +334,10 @@ var MIGProfiles = struct { Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, SliceCount: 7, InstanceCount: 1, - MultiprocessorCount: 7, - SharedCopyEngineCount: 8, + MultiprocessorCount: 98, + SharedCopyEngineCount: 7, SharedDecoderCount: 5, - SharedEncoderCount: 5, + SharedEncoderCount: 0, SharedJpegCount: 1, SharedOfaCount: 1, }, From ba424328e70476ac32f30e73bfc9e9da047e142d Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Tue, 23 Apr 2024 10:44:36 +0000 Subject: [PATCH 2/7] Add global variable to hold placement information for all GIs and CIs For now this only holds GI placement information with placeholders for the CI placement informatin. It should need to be extended to hold CI placement information in the future. Signed-off-by: Kevin Klues --- pkg/nvml/mock/dgxa100/dgxa100.go | 155 +++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) diff --git a/pkg/nvml/mock/dgxa100/dgxa100.go b/pkg/nvml/mock/dgxa100/dgxa100.go index 90e7efb..956093f 100644 --- a/pkg/nvml/mock/dgxa100/dgxa100.go +++ b/pkg/nvml/mock/dgxa100/dgxa100.go @@ -345,6 +345,161 @@ var MIGProfiles = struct { }, } +var MIGPlacements = struct { + GpuInstancePossiblePlacements map[int][]nvml.GpuInstancePlacement + ComputeInstancePossiblePlacements map[int]map[int][]nvml.ComputeInstancePlacement +}{ + GpuInstancePossiblePlacements: map[int][]nvml.GpuInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + { + Start: 0, + Size: 1, + }, + { + Start: 1, + Size: 1, + }, + { + Start: 2, + Size: 1, + }, + { + Start: 3, + Size: 1, + }, + { + Start: 4, + Size: 1, + }, + { + Start: 5, + Size: 1, + }, + { + Start: 6, + Size: 1, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + { + Start: 0, + Size: 1, + }, + { + Start: 1, + Size: 1, + }, + { + Start: 2, + Size: 1, + }, + { + Start: 3, + Size: 1, + }, + { + Start: 4, + Size: 1, + }, + { + Start: 5, + Size: 1, + }, + { + Start: 6, + Size: 1, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + { + Start: 0, + Size: 2, + }, + { + Start: 2, + Size: 2, + }, + { + Start: 4, + Size: 2, + }, + { + Start: 6, + Size: 2, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + { + Start: 0, + Size: 2, + }, + { + Start: 2, + Size: 2, + }, + { + Start: 4, + Size: 2, + }, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + { + Start: 0, + Size: 4, + }, + { + Start: 4, + Size: 4, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + { + Start: 0, + Size: 4, + }, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + { + Start: 0, + Size: 8, + }, + }, + }, + // TODO: Fill out ComputeInstancePossiblePlacements + ComputeInstancePossiblePlacements: map[int]map[int][]nvml.ComputeInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + 
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: {}, + }, + }, +} + func New() nvml.Interface { return &Server{ Devices: [8]nvml.Device{ From b394877e9a8c9bd2b406996bfdba1ee2b6f85669 Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Tue, 23 Apr 2024 11:03:13 +0000 Subject: [PATCH 3/7] Move MIGProfiles and MIGPlacements variables to their own file Signed-off-by: Kevin Klues --- pkg/nvml/mock/dgxa100/dgxa100.go | 446 ------------------------- pkg/nvml/mock/dgxa100/mig-profile.go | 471 +++++++++++++++++++++++++++ 2 files changed, 471 insertions(+), 446 deletions(-) create mode 100644 pkg/nvml/mock/dgxa100/mig-profile.go diff --git a/pkg/nvml/mock/dgxa100/dgxa100.go b/pkg/nvml/mock/dgxa100/dgxa100.go index 956093f..2388c3e 100644 --- a/pkg/nvml/mock/dgxa100/dgxa100.go +++ b/pkg/nvml/mock/dgxa100/dgxa100.go @@ -54,452 +54,6 @@ var _ nvml.Device = (*Device)(nil) var _ nvml.GpuInstance = (*GpuInstance)(nil) var _ nvml.ComputeInstance = (*ComputeInstance)(nil) -var MIGProfiles = struct { - GpuInstanceProfiles map[int]nvml.GpuInstanceProfileInfo - ComputeInstanceProfiles map[int]map[int]nvml.ComputeInstanceProfileInfo -}{ - GpuInstanceProfiles: map[int]nvml.GpuInstanceProfileInfo{ - nvml.GPU_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, - IsP2pSupported: 0, - SliceCount: 1, - InstanceCount: 7, - MultiprocessorCount: 14, - CopyEngineCount: 1, - DecoderCount: 0, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 4864, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { - Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, - IsP2pSupported: 0, - SliceCount: 1, - InstanceCount: 1, - MultiprocessorCount: 14, - CopyEngineCount: 1, - DecoderCount: 1, - EncoderCount: 0, - JpegCount: 1, - OfaCount: 1, - MemorySizeMB: 4864, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { - Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, - IsP2pSupported: 0, - SliceCount: 1, - InstanceCount: 4, - MultiprocessorCount: 14, - CopyEngineCount: 1, - DecoderCount: 1, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 9856, - }, - nvml.GPU_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, - IsP2pSupported: 0, - SliceCount: 2, - InstanceCount: 3, - MultiprocessorCount: 28, - CopyEngineCount: 2, - DecoderCount: 1, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 9856, - }, - nvml.GPU_INSTANCE_PROFILE_3_SLICE: { - Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, - IsP2pSupported: 0, - SliceCount: 3, - InstanceCount: 2, - MultiprocessorCount: 42, - CopyEngineCount: 3, - DecoderCount: 2, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 19968, - }, - nvml.GPU_INSTANCE_PROFILE_4_SLICE: { - Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, - 
IsP2pSupported: 0, - SliceCount: 4, - InstanceCount: 1, - MultiprocessorCount: 56, - CopyEngineCount: 4, - DecoderCount: 2, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 19968, - }, - nvml.GPU_INSTANCE_PROFILE_7_SLICE: { - Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, - IsP2pSupported: 0, - SliceCount: 7, - InstanceCount: 1, - MultiprocessorCount: 98, - CopyEngineCount: 7, - DecoderCount: 5, - EncoderCount: 0, - JpegCount: 1, - OfaCount: 1, - MemorySizeMB: 40192, - }, - }, - ComputeInstanceProfiles: map[int]map[int]nvml.ComputeInstanceProfileInfo{ - nvml.GPU_INSTANCE_PROFILE_1_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 1, - MultiprocessorCount: 14, - SharedCopyEngineCount: 1, - SharedDecoderCount: 0, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 1, - MultiprocessorCount: 14, - SharedCopyEngineCount: 1, - SharedDecoderCount: 1, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 1, - MultiprocessorCount: 14, - SharedCopyEngineCount: 1, - SharedDecoderCount: 1, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_2_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 2, - MultiprocessorCount: 14, - SharedCopyEngineCount: 2, - SharedDecoderCount: 1, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, - SliceCount: 2, - InstanceCount: 1, - MultiprocessorCount: 28, - SharedCopyEngineCount: 2, - SharedDecoderCount: 1, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_3_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 3, - MultiprocessorCount: 14, - SharedCopyEngineCount: 3, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, - SliceCount: 2, - InstanceCount: 1, - MultiprocessorCount: 28, - SharedCopyEngineCount: 3, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, - SliceCount: 3, - InstanceCount: 1, - MultiprocessorCount: 42, - SharedCopyEngineCount: 3, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_4_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 4, - MultiprocessorCount: 14, - SharedCopyEngineCount: 4, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, - SliceCount: 2, - InstanceCount: 2, - MultiprocessorCount: 28, - SharedCopyEngineCount: 4, - SharedDecoderCount: 2, - 
SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, - SliceCount: 4, - InstanceCount: 1, - MultiprocessorCount: 56, - SharedCopyEngineCount: 4, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_7_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 7, - MultiprocessorCount: 14, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, - SliceCount: 2, - InstanceCount: 3, - MultiprocessorCount: 28, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, - SliceCount: 3, - InstanceCount: 2, - MultiprocessorCount: 42, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, - SliceCount: 4, - InstanceCount: 1, - MultiprocessorCount: 56, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, - SliceCount: 7, - InstanceCount: 1, - MultiprocessorCount: 98, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - }, - }, -} - -var MIGPlacements = struct { - GpuInstancePossiblePlacements map[int][]nvml.GpuInstancePlacement - ComputeInstancePossiblePlacements map[int]map[int][]nvml.ComputeInstancePlacement -}{ - GpuInstancePossiblePlacements: map[int][]nvml.GpuInstancePlacement{ - nvml.GPU_INSTANCE_PROFILE_1_SLICE: { - { - Start: 0, - Size: 1, - }, - { - Start: 1, - Size: 1, - }, - { - Start: 2, - Size: 1, - }, - { - Start: 3, - Size: 1, - }, - { - Start: 4, - Size: 1, - }, - { - Start: 5, - Size: 1, - }, - { - Start: 6, - Size: 1, - }, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { - { - Start: 0, - Size: 1, - }, - { - Start: 1, - Size: 1, - }, - { - Start: 2, - Size: 1, - }, - { - Start: 3, - Size: 1, - }, - { - Start: 4, - Size: 1, - }, - { - Start: 5, - Size: 1, - }, - { - Start: 6, - Size: 1, - }, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { - { - Start: 0, - Size: 2, - }, - { - Start: 2, - Size: 2, - }, - { - Start: 4, - Size: 2, - }, - { - Start: 6, - Size: 2, - }, - }, - nvml.GPU_INSTANCE_PROFILE_2_SLICE: { - { - Start: 0, - Size: 2, - }, - { - Start: 2, - Size: 2, - }, - { - Start: 4, - Size: 2, - }, - }, - nvml.GPU_INSTANCE_PROFILE_3_SLICE: { - { - Start: 0, - Size: 4, - }, - { - Start: 4, - Size: 4, - }, - }, - nvml.GPU_INSTANCE_PROFILE_4_SLICE: { - { - Start: 0, - Size: 4, - }, - }, - nvml.GPU_INSTANCE_PROFILE_7_SLICE: { - { - Start: 0, - Size: 8, - }, - }, - }, - // TODO: Fill out ComputeInstancePossiblePlacements - ComputeInstancePossiblePlacements: map[int]map[int][]nvml.ComputeInstancePlacement{ - nvml.GPU_INSTANCE_PROFILE_1_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { - 
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_2_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_3_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_4_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_7_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: {}, - }, - }, -} - func New() nvml.Interface { return &Server{ Devices: [8]nvml.Device{ diff --git a/pkg/nvml/mock/dgxa100/mig-profile.go b/pkg/nvml/mock/dgxa100/mig-profile.go new file mode 100644 index 0000000..c4df4c8 --- /dev/null +++ b/pkg/nvml/mock/dgxa100/mig-profile.go @@ -0,0 +1,471 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dgxa100 + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +// MIGProfiles holds the profile information for GIs and CIs in this mock server. +// We should consider auto-generating this object in the future. 
+var MIGProfiles = struct { + GpuInstanceProfiles map[int]nvml.GpuInstanceProfileInfo + ComputeInstanceProfiles map[int]map[int]nvml.ComputeInstanceProfileInfo +}{ + GpuInstanceProfiles: map[int]nvml.GpuInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 0, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 4864, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 4864, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 9856, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, + IsP2pSupported: 0, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 28, + CopyEngineCount: 2, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 9856, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, + IsP2pSupported: 0, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 42, + CopyEngineCount: 3, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 19968, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, + IsP2pSupported: 0, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 56, + CopyEngineCount: 4, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 19968, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, + IsP2pSupported: 0, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 98, + CopyEngineCount: 7, + DecoderCount: 5, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 40192, + }, + }, + ComputeInstanceProfiles: map[int]map[int]nvml.ComputeInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 14, + SharedCopyEngineCount: 1, + SharedDecoderCount: 0, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 14, + SharedCopyEngineCount: 1, + SharedDecoderCount: 1, + SharedEncoderCount: 0, + SharedJpegCount: 1, + SharedOfaCount: 1, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 14, + SharedCopyEngineCount: 1, + SharedDecoderCount: 1, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 2, + MultiprocessorCount: 14, + SharedCopyEngineCount: 2, + SharedDecoderCount: 1, + SharedEncoderCount: 0, 
+ SharedJpegCount: 0, + SharedOfaCount: 0, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 28, + SharedCopyEngineCount: 2, + SharedDecoderCount: 1, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 3, + MultiprocessorCount: 14, + SharedCopyEngineCount: 3, + SharedDecoderCount: 2, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 28, + SharedCopyEngineCount: 3, + SharedDecoderCount: 2, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 1, + MultiprocessorCount: 42, + SharedCopyEngineCount: 3, + SharedDecoderCount: 2, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 14, + SharedCopyEngineCount: 4, + SharedDecoderCount: 2, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 2, + MultiprocessorCount: 28, + SharedCopyEngineCount: 4, + SharedDecoderCount: 2, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 56, + SharedCopyEngineCount: 4, + SharedDecoderCount: 2, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 14, + SharedCopyEngineCount: 7, + SharedDecoderCount: 5, + SharedEncoderCount: 0, + SharedJpegCount: 1, + SharedOfaCount: 1, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 28, + SharedCopyEngineCount: 7, + SharedDecoderCount: 5, + SharedEncoderCount: 0, + SharedJpegCount: 1, + SharedOfaCount: 1, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 42, + SharedCopyEngineCount: 7, + SharedDecoderCount: 5, + SharedEncoderCount: 0, + SharedJpegCount: 1, + SharedOfaCount: 1, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 56, + SharedCopyEngineCount: 7, + SharedDecoderCount: 5, + SharedEncoderCount: 0, + SharedJpegCount: 1, + SharedOfaCount: 1, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 98, + SharedCopyEngineCount: 7, + SharedDecoderCount: 5, + SharedEncoderCount: 0, + SharedJpegCount: 1, + SharedOfaCount: 1, + }, + }, + }, +} + +// MIGPlacements holds 
the placement information for GIs and CIs in this mock server. +// We should consider auto-generating this object in the future. +var MIGPlacements = struct { + GpuInstancePossiblePlacements map[int][]nvml.GpuInstancePlacement + ComputeInstancePossiblePlacements map[int]map[int][]nvml.ComputeInstancePlacement +}{ + GpuInstancePossiblePlacements: map[int][]nvml.GpuInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + { + Start: 0, + Size: 1, + }, + { + Start: 1, + Size: 1, + }, + { + Start: 2, + Size: 1, + }, + { + Start: 3, + Size: 1, + }, + { + Start: 4, + Size: 1, + }, + { + Start: 5, + Size: 1, + }, + { + Start: 6, + Size: 1, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + { + Start: 0, + Size: 1, + }, + { + Start: 1, + Size: 1, + }, + { + Start: 2, + Size: 1, + }, + { + Start: 3, + Size: 1, + }, + { + Start: 4, + Size: 1, + }, + { + Start: 5, + Size: 1, + }, + { + Start: 6, + Size: 1, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + { + Start: 0, + Size: 2, + }, + { + Start: 2, + Size: 2, + }, + { + Start: 4, + Size: 2, + }, + { + Start: 6, + Size: 2, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + { + Start: 0, + Size: 2, + }, + { + Start: 2, + Size: 2, + }, + { + Start: 4, + Size: 2, + }, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + { + Start: 0, + Size: 4, + }, + { + Start: 4, + Size: 4, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + { + Start: 0, + Size: 4, + }, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + { + Start: 0, + Size: 8, + }, + }, + }, + // TODO: Fill out ComputeInstancePossiblePlacements + ComputeInstancePossiblePlacements: map[int]map[int][]nvml.ComputeInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: {}, + }, + }, +} From 37bdc54d67606e82a7c0789905164f532007361f Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Tue, 23 Apr 2024 11:04:41 +0000 Subject: [PATCH 4/7] Add additional functionality to the mock dgxa100 server Signed-off-by: Kevin Klues --- pkg/nvml/mock/dgxa100/dgxa100.go | 94 ++++++++++++++++++++++++++++---- 1 file changed, 82 insertions(+), 12 deletions(-) diff --git a/pkg/nvml/mock/dgxa100/dgxa100.go b/pkg/nvml/mock/dgxa100/dgxa100.go index 2388c3e..a042435 100644 --- a/pkg/nvml/mock/dgxa100/dgxa100.go +++ b/pkg/nvml/mock/dgxa100/dgxa100.go @@ -26,29 +26,44 @@ import ( type Server struct { mock.Interface - Devices [8]nvml.Device + Devices [8]nvml.Device + DriverVersion string + NvmlVersion string + CudaDriverVersion int } type Device struct { mock.Device - UUID string - PciBusID string - Index int - MigMode int - GpuInstances 
map[*GpuInstance]struct{} - GpuInstanceCounter uint32 - MemoryInfo nvml.Memory + UUID string + Name string + Brand nvml.BrandType + Architecture nvml.DeviceArchitecture + PciBusID string + Minor int + Index int + CudaComputeCapability CudaComputeCapability + MigMode int + GpuInstances map[*GpuInstance]struct{} + GpuInstanceCounter uint32 + MemoryInfo nvml.Memory } + type GpuInstance struct { mock.GpuInstance Info nvml.GpuInstanceInfo ComputeInstances map[*ComputeInstance]struct{} ComputeInstanceCounter uint32 } + type ComputeInstance struct { mock.ComputeInstance Info nvml.ComputeInstanceInfo } +type CudaComputeCapability struct { + Major int + Minor int +} + var _ nvml.Interface = (*Server)(nil) var _ nvml.Device = (*Device)(nil) var _ nvml.GpuInstance = (*GpuInstance)(nil) @@ -66,14 +81,25 @@ func New() nvml.Interface { NewDevice(6), NewDevice(7), }, + DriverVersion: "550.54.15", + NvmlVersion: "12.550.54.15", + CudaDriverVersion: 12040, } } func NewDevice(index int) nvml.Device { return &Device{ - UUID: "GPU-" + uuid.New().String(), - PciBusID: fmt.Sprintf("0000:%02x:00.0", index), - Index: index, + UUID: "GPU-" + uuid.New().String(), + Name: "Mock NVIDIA A100-SXM4-40GB", + Brand: nvml.BRAND_NVIDIA, + Architecture: nvml.DEVICE_ARCH_AMPERE, + PciBusID: fmt.Sprintf("0000:%02x:00.0", index), + Minor: index, + Index: index, + CudaComputeCapability: CudaComputeCapability{ + Major: 8, + Minor: 0, + }, GpuInstances: make(map[*GpuInstance]struct{}), GpuInstanceCounter: 0, MemoryInfo: nvml.Memory{42949672960, 0, 0}, @@ -94,6 +120,14 @@ func NewComputeInstance(info nvml.ComputeInstanceInfo) nvml.ComputeInstance { } } +func (n *Server) Extensions() nvml.ExtendedInterface { + return n +} + +func (n *Server) LookupSymbol(symbol string) error { + return nil +} + func (n *Server) Init() nvml.Return { return nvml.SUCCESS } @@ -102,8 +136,16 @@ func (n *Server) Shutdown() nvml.Return { return nvml.SUCCESS } +func (n *Server) SystemGetDriverVersion() (string, nvml.Return) { + return n.DriverVersion, nvml.SUCCESS +} + func (n *Server) SystemGetNVMLVersion() (string, nvml.Return) { - return "11.450.51", nvml.SUCCESS + return n.NvmlVersion, nvml.SUCCESS +} + +func (n *Server) SystemGetCudaDriverVersion() (int, nvml.Return) { + return n.CudaDriverVersion, nvml.SUCCESS } func (n *Server) DeviceGetCount() (int, nvml.Return) { @@ -135,14 +177,34 @@ func (n *Server) DeviceGetHandleByPciBusId(busID string) (nvml.Device, nvml.Retu return nil, nvml.ERROR_INVALID_ARGUMENT } +func (d *Device) GetMinorNumber() (int, nvml.Return) { + return d.Minor, nvml.SUCCESS +} + func (d *Device) GetIndex() (int, nvml.Return) { return d.Index, nvml.SUCCESS } +func (d *Device) GetCudaComputeCapability() (int, int, nvml.Return) { + return d.CudaComputeCapability.Major, d.CudaComputeCapability.Minor, nvml.SUCCESS +} + func (d *Device) GetUUID() (string, nvml.Return) { return d.UUID, nvml.SUCCESS } +func (d *Device) GetName() (string, nvml.Return) { + return d.Name, nvml.SUCCESS +} + +func (d *Device) GetBrand() (nvml.BrandType, nvml.Return) { + return d.Brand, nvml.SUCCESS +} + +func (d *Device) GetArchitecture() (nvml.DeviceArchitecture, nvml.Return) { + return d.Architecture, nvml.SUCCESS +} + func (d *Device) GetMemoryInfo() (nvml.Memory, nvml.Return) { return d.MemoryInfo, nvml.SUCCESS } @@ -175,6 +237,10 @@ func (d *Device) GetGpuInstanceProfileInfo(giProfileId int) (nvml.GpuInstancePro return MIGProfiles.GpuInstanceProfiles[giProfileId], nvml.SUCCESS } +func (d *Device) GetGpuInstancePossiblePlacements(info 
*nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstancePlacement, nvml.Return) { + return MIGPlacements.GpuInstancePossiblePlacements[int(info.Id)], nvml.SUCCESS +} + func (d *Device) CreateGpuInstance(info *nvml.GpuInstanceProfileInfo) (nvml.GpuInstance, nvml.Return) { giInfo := nvml.GpuInstanceInfo{ Device: d, @@ -236,6 +302,10 @@ func (gi *GpuInstance) GetComputeInstanceProfileInfo(ciProfileId int, ciEngProfi return MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId], nvml.SUCCESS } +func (gi *GpuInstance) GetComputeInstancePossiblePlacements(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstancePlacement, nvml.Return) { + return MIGPlacements.ComputeInstancePossiblePlacements[int(gi.Info.Id)][int(info.Id)], nvml.SUCCESS +} + func (gi *GpuInstance) CreateComputeInstance(info *nvml.ComputeInstanceProfileInfo) (nvml.ComputeInstance, nvml.Return) { ciInfo := nvml.ComputeInstanceInfo{ Device: gi.Info.Device, From 93fa13d09800b38b6c49e5a9dcfebb93d6b8cb40 Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Tue, 23 Apr 2024 12:29:25 +0000 Subject: [PATCH 5/7] Assign dgxa100 mock methods as function pointers instead of overwriting Signed-off-by: Kevin Klues --- pkg/nvml/mock/dgxa100/dgxa100.go | 355 ++++++++++++++++--------------- 1 file changed, 186 insertions(+), 169 deletions(-) diff --git a/pkg/nvml/mock/dgxa100/dgxa100.go b/pkg/nvml/mock/dgxa100/dgxa100.go index a042435..e15121b 100644 --- a/pkg/nvml/mock/dgxa100/dgxa100.go +++ b/pkg/nvml/mock/dgxa100/dgxa100.go @@ -26,6 +26,7 @@ import ( type Server struct { mock.Interface + mock.ExtendedInterface Devices [8]nvml.Device DriverVersion string NvmlVersion string @@ -70,7 +71,7 @@ var _ nvml.GpuInstance = (*GpuInstance)(nil) var _ nvml.ComputeInstance = (*ComputeInstance)(nil) func New() nvml.Interface { - return &Server{ + server := &Server{ Devices: [8]nvml.Device{ NewDevice(0), NewDevice(1), @@ -85,10 +86,12 @@ func New() nvml.Interface { NvmlVersion: "12.550.54.15", CudaDriverVersion: 12040, } + server.setMockFuncs() + return server } func NewDevice(index int) nvml.Device { - return &Device{ + device := &Device{ UUID: "GPU-" + uuid.New().String(), Name: "Mock NVIDIA A100-SXM4-40GB", Brand: nvml.BRAND_NVIDIA, @@ -104,241 +107,255 @@ func NewDevice(index int) nvml.Device { GpuInstanceCounter: 0, MemoryInfo: nvml.Memory{42949672960, 0, 0}, } + device.setMockFuncs() + return device } func NewGpuInstance(info nvml.GpuInstanceInfo) nvml.GpuInstance { - return &GpuInstance{ + gi := &GpuInstance{ Info: info, ComputeInstances: make(map[*ComputeInstance]struct{}), ComputeInstanceCounter: 0, } + gi.setMockFuncs() + return gi } func NewComputeInstance(info nvml.ComputeInstanceInfo) nvml.ComputeInstance { - return &ComputeInstance{ + ci := &ComputeInstance{ Info: info, } + ci.setMockFuncs() + return ci } -func (n *Server) Extensions() nvml.ExtendedInterface { - return n -} - -func (n *Server) LookupSymbol(symbol string) error { - return nil -} +func (s *Server) setMockFuncs() { + s.ExtensionsFunc = func() nvml.ExtendedInterface { + return s + } -func (n *Server) Init() nvml.Return { - return nvml.SUCCESS -} + s.LookupSymbolFunc = func(symbol string) error { + return nil + } -func (n *Server) Shutdown() nvml.Return { - return nvml.SUCCESS -} + s.InitFunc = func() nvml.Return { + return nvml.SUCCESS + } -func (n *Server) SystemGetDriverVersion() (string, nvml.Return) { - return n.DriverVersion, nvml.SUCCESS -} + s.ShutdownFunc = func() nvml.Return { + return nvml.SUCCESS + } -func (n *Server) SystemGetNVMLVersion() (string, nvml.Return) { - 
return n.NvmlVersion, nvml.SUCCESS -} + s.SystemGetDriverVersionFunc = func() (string, nvml.Return) { + return s.DriverVersion, nvml.SUCCESS + } -func (n *Server) SystemGetCudaDriverVersion() (int, nvml.Return) { - return n.CudaDriverVersion, nvml.SUCCESS -} + s.SystemGetNVMLVersionFunc = func() (string, nvml.Return) { + return s.NvmlVersion, nvml.SUCCESS + } -func (n *Server) DeviceGetCount() (int, nvml.Return) { - return len(n.Devices), nvml.SUCCESS -} + s.SystemGetCudaDriverVersionFunc = func() (int, nvml.Return) { + return s.CudaDriverVersion, nvml.SUCCESS + } -func (n *Server) DeviceGetHandleByIndex(index int) (nvml.Device, nvml.Return) { - if index < 0 || index >= len(n.Devices) { - return nil, nvml.ERROR_INVALID_ARGUMENT + s.DeviceGetCountFunc = func() (int, nvml.Return) { + return len(s.Devices), nvml.SUCCESS } - return n.Devices[index], nvml.SUCCESS -} -func (n *Server) DeviceGetHandleByUUID(uuid string) (nvml.Device, nvml.Return) { - for _, d := range n.Devices { - if uuid == d.(*Device).UUID { - return d, nvml.SUCCESS + s.DeviceGetHandleByIndexFunc = func(index int) (nvml.Device, nvml.Return) { + if index < 0 || index >= len(s.Devices) { + return nil, nvml.ERROR_INVALID_ARGUMENT } + return s.Devices[index], nvml.SUCCESS } - return nil, nvml.ERROR_INVALID_ARGUMENT -} -func (n *Server) DeviceGetHandleByPciBusId(busID string) (nvml.Device, nvml.Return) { - for _, d := range n.Devices { - if busID == d.(*Device).PciBusID { - return d, nvml.SUCCESS + s.DeviceGetHandleByUUIDFunc = func(uuid string) (nvml.Device, nvml.Return) { + for _, d := range s.Devices { + if uuid == d.(*Device).UUID { + return d, nvml.SUCCESS + } } + return nil, nvml.ERROR_INVALID_ARGUMENT } - return nil, nvml.ERROR_INVALID_ARGUMENT -} - -func (d *Device) GetMinorNumber() (int, nvml.Return) { - return d.Minor, nvml.SUCCESS -} -func (d *Device) GetIndex() (int, nvml.Return) { - return d.Index, nvml.SUCCESS + s.DeviceGetHandleByPciBusIdFunc = func(busID string) (nvml.Device, nvml.Return) { + for _, d := range s.Devices { + if busID == d.(*Device).PciBusID { + return d, nvml.SUCCESS + } + } + return nil, nvml.ERROR_INVALID_ARGUMENT + } } -func (d *Device) GetCudaComputeCapability() (int, int, nvml.Return) { - return d.CudaComputeCapability.Major, d.CudaComputeCapability.Minor, nvml.SUCCESS -} +func (d *Device) setMockFuncs() { + d.GetMinorNumberFunc = func() (int, nvml.Return) { + return d.Minor, nvml.SUCCESS + } -func (d *Device) GetUUID() (string, nvml.Return) { - return d.UUID, nvml.SUCCESS -} + d.GetIndexFunc = func() (int, nvml.Return) { + return d.Index, nvml.SUCCESS + } -func (d *Device) GetName() (string, nvml.Return) { - return d.Name, nvml.SUCCESS -} + d.GetCudaComputeCapabilityFunc = func() (int, int, nvml.Return) { + return d.CudaComputeCapability.Major, d.CudaComputeCapability.Minor, nvml.SUCCESS + } -func (d *Device) GetBrand() (nvml.BrandType, nvml.Return) { - return d.Brand, nvml.SUCCESS -} + d.GetUUIDFunc = func() (string, nvml.Return) { + return d.UUID, nvml.SUCCESS + } -func (d *Device) GetArchitecture() (nvml.DeviceArchitecture, nvml.Return) { - return d.Architecture, nvml.SUCCESS -} + d.GetNameFunc = func() (string, nvml.Return) { + return d.Name, nvml.SUCCESS + } -func (d *Device) GetMemoryInfo() (nvml.Memory, nvml.Return) { - return d.MemoryInfo, nvml.SUCCESS -} + d.GetBrandFunc = func() (nvml.BrandType, nvml.Return) { + return d.Brand, nvml.SUCCESS + } -func (d *Device) GetPciInfo() (nvml.PciInfo, nvml.Return) { - p := nvml.PciInfo{ - PciDeviceId: 0x20B010DE, + d.GetArchitectureFunc = func() 
(nvml.DeviceArchitecture, nvml.Return) { + return d.Architecture, nvml.SUCCESS } - return p, nvml.SUCCESS -} -func (d *Device) SetMigMode(mode int) (nvml.Return, nvml.Return) { - d.MigMode = mode - return nvml.SUCCESS, nvml.SUCCESS -} + d.GetMemoryInfoFunc = func() (nvml.Memory, nvml.Return) { + return d.MemoryInfo, nvml.SUCCESS + } -func (d *Device) GetMigMode() (int, int, nvml.Return) { - return d.MigMode, d.MigMode, nvml.SUCCESS -} + d.GetPciInfoFunc = func() (nvml.PciInfo, nvml.Return) { + p := nvml.PciInfo{ + PciDeviceId: 0x20B010DE, + } + return p, nvml.SUCCESS + } -func (d *Device) GetGpuInstanceProfileInfo(giProfileId int) (nvml.GpuInstanceProfileInfo, nvml.Return) { - if giProfileId < 0 || giProfileId >= nvml.GPU_INSTANCE_PROFILE_COUNT { - return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT + d.SetMigModeFunc = func(mode int) (nvml.Return, nvml.Return) { + d.MigMode = mode + return nvml.SUCCESS, nvml.SUCCESS } - if _, exists := MIGProfiles.GpuInstanceProfiles[giProfileId]; !exists { - return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED + d.GetMigModeFunc = func() (int, int, nvml.Return) { + return d.MigMode, d.MigMode, nvml.SUCCESS } - return MIGProfiles.GpuInstanceProfiles[giProfileId], nvml.SUCCESS -} + d.GetGpuInstanceProfileInfoFunc = func(giProfileId int) (nvml.GpuInstanceProfileInfo, nvml.Return) { + if giProfileId < 0 || giProfileId >= nvml.GPU_INSTANCE_PROFILE_COUNT { + return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT + } -func (d *Device) GetGpuInstancePossiblePlacements(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstancePlacement, nvml.Return) { - return MIGPlacements.GpuInstancePossiblePlacements[int(info.Id)], nvml.SUCCESS -} + if _, exists := MIGProfiles.GpuInstanceProfiles[giProfileId]; !exists { + return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED + } -func (d *Device) CreateGpuInstance(info *nvml.GpuInstanceProfileInfo) (nvml.GpuInstance, nvml.Return) { - giInfo := nvml.GpuInstanceInfo{ - Device: d, - Id: d.GpuInstanceCounter, - ProfileId: info.Id, + return MIGProfiles.GpuInstanceProfiles[giProfileId], nvml.SUCCESS } - d.GpuInstanceCounter++ - gi := NewGpuInstance(giInfo) - d.GpuInstances[gi.(*GpuInstance)] = struct{}{} - return gi, nvml.SUCCESS -} -func (d *Device) CreateGpuInstanceWithPlacement(info *nvml.GpuInstanceProfileInfo, placement *nvml.GpuInstancePlacement) (nvml.GpuInstance, nvml.Return) { - giInfo := nvml.GpuInstanceInfo{ - Device: d, - Id: d.GpuInstanceCounter, - ProfileId: info.Id, - Placement: *placement, - } - d.GpuInstanceCounter++ - gi := NewGpuInstance(giInfo) - d.GpuInstances[gi.(*GpuInstance)] = struct{}{} - return gi, nvml.SUCCESS -} + d.GetGpuInstancePossiblePlacementsFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstancePlacement, nvml.Return) { + return MIGPlacements.GpuInstancePossiblePlacements[int(info.Id)], nvml.SUCCESS + } -func (d *Device) GetGpuInstances(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstance, nvml.Return) { - var gis []nvml.GpuInstance - for gi := range d.GpuInstances { - if gi.Info.ProfileId == info.Id { - gis = append(gis, gi) + d.CreateGpuInstanceFunc = func(info *nvml.GpuInstanceProfileInfo) (nvml.GpuInstance, nvml.Return) { + giInfo := nvml.GpuInstanceInfo{ + Device: d, + Id: d.GpuInstanceCounter, + ProfileId: info.Id, } + d.GpuInstanceCounter++ + gi := NewGpuInstance(giInfo) + d.GpuInstances[gi.(*GpuInstance)] = struct{}{} + return gi, nvml.SUCCESS } - return gis, nvml.SUCCESS -} -func (gi *GpuInstance) GetInfo() (nvml.GpuInstanceInfo, 
nvml.Return) { - return gi.Info, nvml.SUCCESS -} + d.CreateGpuInstanceWithPlacementFunc = func(info *nvml.GpuInstanceProfileInfo, placement *nvml.GpuInstancePlacement) (nvml.GpuInstance, nvml.Return) { + giInfo := nvml.GpuInstanceInfo{ + Device: d, + Id: d.GpuInstanceCounter, + ProfileId: info.Id, + Placement: *placement, + } + d.GpuInstanceCounter++ + gi := NewGpuInstance(giInfo) + d.GpuInstances[gi.(*GpuInstance)] = struct{}{} + return gi, nvml.SUCCESS + } -func (gi *GpuInstance) GetComputeInstanceProfileInfo(ciProfileId int, ciEngProfileId int) (nvml.ComputeInstanceProfileInfo, nvml.Return) { - if ciProfileId < 0 || ciProfileId >= nvml.COMPUTE_INSTANCE_PROFILE_COUNT { - return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT + d.GetGpuInstancesFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstance, nvml.Return) { + var gis []nvml.GpuInstance + for gi := range d.GpuInstances { + if gi.Info.ProfileId == info.Id { + gis = append(gis, gi) + } + } + return gis, nvml.SUCCESS } +} - if ciEngProfileId != nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED { - return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED +func (gi *GpuInstance) setMockFuncs() { + gi.GetInfoFunc = func() (nvml.GpuInstanceInfo, nvml.Return) { + return gi.Info, nvml.SUCCESS } - giProfileId := int(gi.Info.ProfileId) + gi.GetComputeInstanceProfileInfoFunc = func(ciProfileId int, ciEngProfileId int) (nvml.ComputeInstanceProfileInfo, nvml.Return) { + if ciProfileId < 0 || ciProfileId >= nvml.COMPUTE_INSTANCE_PROFILE_COUNT { + return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT + } + + if ciEngProfileId != nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED { + return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED + } - if _, exists := MIGProfiles.ComputeInstanceProfiles[giProfileId]; !exists { - return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED - } + giProfileId := int(gi.Info.ProfileId) - if _, exists := MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId]; !exists { - return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED - } + if _, exists := MIGProfiles.ComputeInstanceProfiles[giProfileId]; !exists { + return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED + } - return MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId], nvml.SUCCESS -} + if _, exists := MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId]; !exists { + return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED + } -func (gi *GpuInstance) GetComputeInstancePossiblePlacements(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstancePlacement, nvml.Return) { - return MIGPlacements.ComputeInstancePossiblePlacements[int(gi.Info.Id)][int(info.Id)], nvml.SUCCESS -} + return MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId], nvml.SUCCESS + } -func (gi *GpuInstance) CreateComputeInstance(info *nvml.ComputeInstanceProfileInfo) (nvml.ComputeInstance, nvml.Return) { - ciInfo := nvml.ComputeInstanceInfo{ - Device: gi.Info.Device, - GpuInstance: gi, - Id: gi.ComputeInstanceCounter, - ProfileId: info.Id, - } - gi.ComputeInstanceCounter++ - ci := NewComputeInstance(ciInfo) - gi.ComputeInstances[ci.(*ComputeInstance)] = struct{}{} - return ci, nvml.SUCCESS -} + gi.GetComputeInstancePossiblePlacementsFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstancePlacement, nvml.Return) { + return MIGPlacements.ComputeInstancePossiblePlacements[int(gi.Info.Id)][int(info.Id)], nvml.SUCCESS + } -func (gi *GpuInstance) 
GetComputeInstances(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstance, nvml.Return) { - var cis []nvml.ComputeInstance - for ci := range gi.ComputeInstances { - if ci.Info.ProfileId == info.Id { - cis = append(cis, ci) + gi.CreateComputeInstanceFunc = func(info *nvml.ComputeInstanceProfileInfo) (nvml.ComputeInstance, nvml.Return) { + ciInfo := nvml.ComputeInstanceInfo{ + Device: gi.Info.Device, + GpuInstance: gi, + Id: gi.ComputeInstanceCounter, + ProfileId: info.Id, } + gi.ComputeInstanceCounter++ + ci := NewComputeInstance(ciInfo) + gi.ComputeInstances[ci.(*ComputeInstance)] = struct{}{} + return ci, nvml.SUCCESS } - return cis, nvml.SUCCESS -} -func (gi *GpuInstance) Destroy() nvml.Return { - delete(gi.Info.Device.(*Device).GpuInstances, gi) - return nvml.SUCCESS -} + gi.GetComputeInstancesFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstance, nvml.Return) { + var cis []nvml.ComputeInstance + for ci := range gi.ComputeInstances { + if ci.Info.ProfileId == info.Id { + cis = append(cis, ci) + } + } + return cis, nvml.SUCCESS + } -func (ci *ComputeInstance) GetInfo() (nvml.ComputeInstanceInfo, nvml.Return) { - return ci.Info, nvml.SUCCESS + gi.DestroyFunc = func() nvml.Return { + delete(gi.Info.Device.(*Device).GpuInstances, gi) + return nvml.SUCCESS + } } -func (ci *ComputeInstance) Destroy() nvml.Return { - delete(ci.Info.GpuInstance.(*GpuInstance).ComputeInstances, ci) - return nvml.SUCCESS +func (ci *ComputeInstance) setMockFuncs() { + ci.GetInfoFunc = func() (nvml.ComputeInstanceInfo, nvml.Return) { + return ci.Info, nvml.SUCCESS + } + + ci.DestroyFunc = func() nvml.Return { + delete(ci.Info.GpuInstance.(*GpuInstance).ComputeInstances, ci) + return nvml.SUCCESS + } } From 5e1cdb1ccd97976e81b418175a0f51172120b640 Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Tue, 23 Apr 2024 13:03:48 +0000 Subject: [PATCH 6/7] Return concrete types from mock dgxa100 server instead of interfaces This way the callers can extend these types to futher override their functions if desired, while still being able to assign them to the appropriate nvml interfaces. 
Signed-off-by: Kevin Klues --- pkg/nvml/mock/dgxa100/dgxa100.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/nvml/mock/dgxa100/dgxa100.go b/pkg/nvml/mock/dgxa100/dgxa100.go index e15121b..9cb5348 100644 --- a/pkg/nvml/mock/dgxa100/dgxa100.go +++ b/pkg/nvml/mock/dgxa100/dgxa100.go @@ -70,7 +70,7 @@ var _ nvml.Device = (*Device)(nil) var _ nvml.GpuInstance = (*GpuInstance)(nil) var _ nvml.ComputeInstance = (*ComputeInstance)(nil) -func New() nvml.Interface { +func New() *Server { server := &Server{ Devices: [8]nvml.Device{ NewDevice(0), @@ -90,7 +90,7 @@ func New() nvml.Interface { return server } -func NewDevice(index int) nvml.Device { +func NewDevice(index int) *Device { device := &Device{ UUID: "GPU-" + uuid.New().String(), Name: "Mock NVIDIA A100-SXM4-40GB", @@ -111,7 +111,7 @@ func NewDevice(index int) nvml.Device { return device } -func NewGpuInstance(info nvml.GpuInstanceInfo) nvml.GpuInstance { +func NewGpuInstance(info nvml.GpuInstanceInfo) *GpuInstance { gi := &GpuInstance{ Info: info, ComputeInstances: make(map[*ComputeInstance]struct{}), @@ -121,7 +121,7 @@ func NewGpuInstance(info nvml.GpuInstanceInfo) nvml.GpuInstance { return gi } -func NewComputeInstance(info nvml.ComputeInstanceInfo) nvml.ComputeInstance { +func NewComputeInstance(info nvml.ComputeInstanceInfo) *ComputeInstance { ci := &ComputeInstance{ Info: info, } @@ -261,7 +261,7 @@ func (d *Device) setMockFuncs() { } d.GpuInstanceCounter++ gi := NewGpuInstance(giInfo) - d.GpuInstances[gi.(*GpuInstance)] = struct{}{} + d.GpuInstances[gi] = struct{}{} return gi, nvml.SUCCESS } @@ -274,7 +274,7 @@ func (d *Device) setMockFuncs() { } d.GpuInstanceCounter++ gi := NewGpuInstance(giInfo) - d.GpuInstances[gi.(*GpuInstance)] = struct{}{} + d.GpuInstances[gi] = struct{}{} return gi, nvml.SUCCESS } @@ -329,7 +329,7 @@ func (gi *GpuInstance) setMockFuncs() { } gi.ComputeInstanceCounter++ ci := NewComputeInstance(ciInfo) - gi.ComputeInstances[ci.(*ComputeInstance)] = struct{}{} + gi.ComputeInstances[ci] = struct{}{} return ci, nvml.SUCCESS } From 6895ece75d795098706363b29445421d4c86a3cd Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Tue, 23 Apr 2024 15:33:58 +0000 Subject: [PATCH 7/7] Add Mutexes to mock dgxa100 types to avoid concurrent maps reads/writes Signed-off-by: Kevin Klues --- pkg/nvml/mock/dgxa100/dgxa100.go | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/pkg/nvml/mock/dgxa100/dgxa100.go b/pkg/nvml/mock/dgxa100/dgxa100.go index 9cb5348..7654dc7 100644 --- a/pkg/nvml/mock/dgxa100/dgxa100.go +++ b/pkg/nvml/mock/dgxa100/dgxa100.go @@ -18,6 +18,7 @@ package dgxa100 import ( "fmt" + "sync" "github.com/NVIDIA/go-nvml/pkg/nvml" "github.com/NVIDIA/go-nvml/pkg/nvml/mock" @@ -34,6 +35,7 @@ type Server struct { } type Device struct { mock.Device + sync.RWMutex UUID string Name string Brand nvml.BrandType @@ -50,6 +52,7 @@ type Device struct { type GpuInstance struct { mock.GpuInstance + sync.RWMutex Info nvml.GpuInstanceInfo ComputeInstances map[*ComputeInstance]struct{} ComputeInstanceCounter uint32 @@ -254,6 +257,8 @@ func (d *Device) setMockFuncs() { } d.CreateGpuInstanceFunc = func(info *nvml.GpuInstanceProfileInfo) (nvml.GpuInstance, nvml.Return) { + d.Lock() + defer d.Unlock() giInfo := nvml.GpuInstanceInfo{ Device: d, Id: d.GpuInstanceCounter, @@ -266,6 +271,8 @@ func (d *Device) setMockFuncs() { } d.CreateGpuInstanceWithPlacementFunc = func(info *nvml.GpuInstanceProfileInfo, placement *nvml.GpuInstancePlacement) (nvml.GpuInstance, 
nvml.Return) { + d.Lock() + defer d.Unlock() giInfo := nvml.GpuInstanceInfo{ Device: d, Id: d.GpuInstanceCounter, @@ -279,6 +286,8 @@ func (d *Device) setMockFuncs() { } d.GetGpuInstancesFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstance, nvml.Return) { + d.RLock() + defer d.RUnlock() var gis []nvml.GpuInstance for gi := range d.GpuInstances { if gi.Info.ProfileId == info.Id { @@ -321,6 +330,8 @@ func (gi *GpuInstance) setMockFuncs() { } gi.CreateComputeInstanceFunc = func(info *nvml.ComputeInstanceProfileInfo) (nvml.ComputeInstance, nvml.Return) { + gi.Lock() + defer gi.Unlock() ciInfo := nvml.ComputeInstanceInfo{ Device: gi.Info.Device, GpuInstance: gi, @@ -334,6 +345,8 @@ func (gi *GpuInstance) setMockFuncs() { } gi.GetComputeInstancesFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstance, nvml.Return) { + gi.RLock() + defer gi.RUnlock() var cis []nvml.ComputeInstance for ci := range gi.ComputeInstances { if ci.Info.ProfileId == info.Id { @@ -344,7 +357,10 @@ func (gi *GpuInstance) setMockFuncs() { } gi.DestroyFunc = func() nvml.Return { - delete(gi.Info.Device.(*Device).GpuInstances, gi) + d := gi.Info.Device.(*Device) + d.Lock() + defer d.Unlock() + delete(d.GpuInstances, gi) return nvml.SUCCESS } } @@ -355,7 +371,10 @@ func (ci *ComputeInstance) setMockFuncs() { } ci.DestroyFunc = func() nvml.Return { - delete(ci.Info.GpuInstance.(*GpuInstance).ComputeInstances, ci) + gi := ci.Info.GpuInstance.(*GpuInstance) + gi.Lock() + defer gi.Unlock() + delete(gi.ComputeInstances, ci) return nvml.SUCCESS } }
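
Usage sketch for reviewers: the snippet below walks the mock end to end once the full series is applied. It reads the realistic GI/CI profile and placement data (patches 1, 2, and 4), relies on New() returning the concrete *Server type (patch 6), and finishes by overriding one of the function pointers installed in patch 5. This is a minimal illustrative sketch, not part of the patches themselves: it assumes the standard github.com/NVIDIA/go-nvml import paths, trims error handling, and the "Overridden name" string is invented for the example.

package main

import (
	"fmt"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100"
)

func main() {
	// New() now returns the concrete *dgxa100.Server, which still satisfies nvml.Interface.
	server := dgxa100.New()
	var _ nvml.Interface = server

	device, ret := server.DeviceGetHandleByIndex(0)
	if ret != nvml.SUCCESS {
		panic(ret)
	}

	// The 1-slice GI profile now reports realistic A100-40GB values (14 SMs, 4864 MB).
	giProfile, _ := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE)
	fmt.Printf("1-slice GI profile: %d SMs, %d MB\n", giProfile.MultiprocessorCount, giProfile.MemorySizeMB)

	// Placement data from MIGPlacements is exposed through the new getter.
	placements, _ := device.GetGpuInstancePossiblePlacements(&giProfile)
	fmt.Printf("possible 1-slice placements: %d\n", len(placements))

	// Create a GPU instance and a compute instance on top of it.
	gi, _ := device.CreateGpuInstance(&giProfile)
	defer gi.Destroy()

	ciProfile, _ := gi.GetComputeInstanceProfileInfo(
		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED)
	ci, _ := gi.CreateComputeInstance(&ciProfile)
	defer ci.Destroy()

	// Because the mock methods are installed as function pointers on the embedded
	// moq types, and the constructors return concrete types, callers can override
	// individual calls per test.
	dev0 := server.Devices[0].(*dgxa100.Device)
	dev0.GetNameFunc = func() (string, nvml.Return) {
		return "Overridden name", nvml.SUCCESS
	}
	name, _ := device.GetName()
	fmt.Println(name)
}

With the RWMutexes added in the final patch, the CreateGpuInstance/GetGpuInstances, CreateComputeInstance/GetComputeInstances, and Destroy paths shown above can also be exercised from concurrent goroutines without racing on the internal GpuInstances and ComputeInstances maps.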