From 463e54c16415dff7a29bf5fe6e04c3f51d3a5dcd Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Wed, 10 Jul 2024 17:47:51 +0800 Subject: [PATCH] feat(components/nvidia): initial commit Signed-off-by: Gyuho Lee --- .golangci.yml | 36 + cmd/leptond/command/main.go | 9 +- components/accelerator/nvidia/info/info.go | 149 ++ components/accelerator/nvidia/nvidia.go | 13 + components/accelerator/nvidia/query/gpu.go | 211 +++ .../accelerator/nvidia/query/gpu_test.go | 88 + .../accelerator/nvidia/query/querier.go | 128 ++ .../accelerator/nvidia/query/querier_test.go | 72 + components/accelerator/nvidia/query/query.go | 348 ++++ .../accelerator/nvidia/query/query_test.go | 275 +++ .../nvidia-smi-query.535.154.05.out.0.valid | 1608 ++++++++++++++++ .../nvidia-smi-query.535.154.05.out.1.valid | 1580 ++++++++++++++++ .../nvidia-smi-query.535.154.05.out.2.valid | 1584 ++++++++++++++++ .../nvidia-smi-query.535.183.01.out.0.invalid | 91 + .../nvidia-smi-query.535.183.01.out.0.valid | 1652 +++++++++++++++++ components/components.go | 13 + go.mod | 11 + go.sum | 52 + internal/server/config/config.go | 3 + internal/server/config/config_test.go | 17 + internal/server/config/testdata/test.0.yaml | 9 + internal/server/server.go | 27 +- 22 files changed, 7968 insertions(+), 8 deletions(-) create mode 100644 .golangci.yml create mode 100644 components/accelerator/nvidia/info/info.go create mode 100644 components/accelerator/nvidia/nvidia.go create mode 100644 components/accelerator/nvidia/query/gpu.go create mode 100644 components/accelerator/nvidia/query/gpu_test.go create mode 100644 components/accelerator/nvidia/query/querier.go create mode 100644 components/accelerator/nvidia/query/querier_test.go create mode 100644 components/accelerator/nvidia/query/query.go create mode 100644 components/accelerator/nvidia/query/query_test.go create mode 100644 components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.154.05.out.0.valid create mode 100644 components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.154.05.out.1.valid create mode 100644 components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.154.05.out.2.valid create mode 100644 components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.183.01.out.0.invalid create mode 100644 components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.183.01.out.0.valid create mode 100644 internal/server/config/config_test.go create mode 100644 internal/server/config/testdata/test.0.yaml diff --git a/.golangci.yml b/.golangci.yml new file mode 100644 index 00000000..4e63fe9c --- /dev/null +++ b/.golangci.yml @@ -0,0 +1,36 @@ +# https://golangci-lint.run/usage/configuration/ +run: + concurrency: 4 + timeout: 15m + + # include test files or not, default is true + tests: true + +linters-settings: + gofmt: + # simplify code: gofmt with `-s` option, true by default + simplify: true + goimports: + # put imports beginning with prefix after 3rd-party packages; + # it's a comma-separated list of prefixes + local-prefixes: lepton.ai/lepton + misspell: + # Correct spellings using locale preferences for US or UK. + # Default is to use a neutral variety of English. + # Setting locale to US will correct the British spelling of 'colour' to 'color'. + locale: US + +linters: + fast: false + disable-all: true + enable: + - errcheck + - gosimple + - govet + - ineffassign + - staticcheck + - unused + - gofmt + - goimports + - misspell + - unconvert diff --git a/cmd/leptond/command/main.go b/cmd/leptond/command/main.go index 86ed245e..9d3f2ac2 100644 --- a/cmd/leptond/command/main.go +++ b/cmd/leptond/command/main.go @@ -35,7 +35,8 @@ func App() *cli.App { }, } app.Action = func(cliContext *cli.Context) error { - ctx := context.Background() + rootCtx, rootCancel := context.WithCancel(context.Background()) + defer rootCancel() start := time.Now() config := defaultConfig() signals := make(chan os.Signal, 2048) @@ -54,18 +55,18 @@ func App() *cli.App { log.Logger.Infof("starting leptond %v: %v", version.Version, version.Version) - done := handleSignals(ctx, signals, serverC) + done := handleSignals(rootCtx, signals, serverC) // start the signal handler as soon as we can to make sure that // we don't miss any signals during boot signal.Notify(signals, handledSignals...) - server, err := lepServer.New(config) + server, err := lepServer.New(rootCtx, config) if err != nil { return err } serverC <- server - if err := notifyReady(ctx); err != nil { + if err := notifyReady(rootCtx); err != nil { log.Logger.Warn("notify ready failed") } log.Logger.Infof("successfully booted in %fs", time.Since(start).Seconds()) diff --git a/components/accelerator/nvidia/info/info.go b/components/accelerator/nvidia/info/info.go new file mode 100644 index 00000000..a7f077d0 --- /dev/null +++ b/components/accelerator/nvidia/info/info.go @@ -0,0 +1,149 @@ +// Package info implements static information display. +package info + +import ( + "context" + "encoding/json" + "net/http" + "strconv" + "time" + + "github.com/leptonai/leptond/components" + nvidiaquery "github.com/leptonai/leptond/components/accelerator/nvidia/query" +) + +const ( + // Serves relatively static information about the NVIDIA accelerator. + Name = "accelerator-nvidia-info" + + StateKeyDriver = "driver" + StateKeyCUDA = "cuda" + StateKeyGPU = "gpu" + StateKeyProduct = "product" + StateKeyProductName = "name" + StateKeyProductBrand = "brand" + StateKeyProductArchitecture = "architecture" +) + +func New(ctx context.Context, queryInterval time.Duration) components.Component { + cctx, ccancel := context.WithCancel(ctx) + nvidiaquery.DefaultQuerier.Start(cctx, queryInterval) + return &component{ + rootCtx: ctx, + cancel: ccancel, + querier: nvidiaquery.DefaultQuerier, + } +} + +var _ components.Component = (*component)(nil) + +type component struct { + rootCtx context.Context + cancel context.CancelFunc + querier nvidiaquery.Querier +} + +func (c *component) Name() string { return Name } + +// in case we want to update the interval +// TODO: integrate with the component interface +func (c *component) Apply(cfg any) error { + qcfg, err := nvidiaquery.ParseQuerierConfig(cfg) + if err != nil { + return err + } + + // stop to apply the config + // stopping querier is fine + // because it still serves the last request + c.cancel() + c.querier.Stop() + + ctx, cancel := context.WithCancel(c.rootCtx) + c.cancel = cancel + c.querier.Start(ctx, qcfg.Interval) + return nil +} + +func (c *component) State(ctx context.Context) ([]components.State, error) { + last := c.querier.Last() + if last.Error != nil { + return []components.State{ + { + Healthy: false, + Error: last.Error, + Reason: "last query failed", + }, + }, nil + } + if last.Output == nil { + return []components.State{ + { + Healthy: false, + Reason: "no output", + }, + }, nil + } + + cs := []components.State{ + { + Name: StateKeyDriver, + Healthy: true, + ExtraInfo: map[string]string{ + "version": last.Output.DriverVersion, + }, + }, + { + Name: StateKeyCUDA, + Healthy: true, + ExtraInfo: map[string]string{ + "version": last.Output.CUDAVersion, + }, + }, + { + Name: StateKeyGPU, + Healthy: true, + ExtraInfo: map[string]string{ + "attached": strconv.Itoa(last.Output.AttachedGPUs), + }, + }, + } + if len(last.Output.GPUs) > 0 { + cs = append(cs, components.State{ + Name: StateKeyProduct, + Healthy: true, + ExtraInfo: map[string]string{ + StateKeyProductName: last.Output.GPUs[0].ProductName, + StateKeyProductBrand: last.Output.GPUs[0].ProductBrand, + StateKeyProductArchitecture: last.Output.GPUs[0].ProductArchitecture, + }, + }) + } + return cs, nil +} + +func (c *component) SetState(ctx context.Context, states ...components.State) error { + return nil +} + +func (c *component) Events(ctx context.Context, since time.Time) ([]components.Event, error) { + return nil, nil +} + +func (c *component) ServeHTTP(resp http.ResponseWriter, req *http.Request) { + resp.Header().Set("Content-Type", "application/json") + if err := json.NewEncoder(resp).Encode(c.querier.All()); err != nil { + http.Error(resp, "failed to encode response", http.StatusInternalServerError) + return + } +} + +func (c *component) HTTPHandler() http.Handler { + return c +} + +func (c *component) Close() error { + // safe to call stop multiple times + c.querier.Stop() + return nil +} diff --git a/components/accelerator/nvidia/nvidia.go b/components/accelerator/nvidia/nvidia.go new file mode 100644 index 00000000..06bbd4f1 --- /dev/null +++ b/components/accelerator/nvidia/nvidia.go @@ -0,0 +1,13 @@ +package nvidia + +import "os/exec" + +// Returns true if the local machine runs on Nvidia GPU +// by running "nvidia-smi". +func SMIExists() bool { + p, err := exec.LookPath("nvidia-smi") + if err != nil { + return false + } + return p != "" +} diff --git a/components/accelerator/nvidia/query/gpu.go b/components/accelerator/nvidia/query/gpu.go new file mode 100644 index 00000000..a251941a --- /dev/null +++ b/components/accelerator/nvidia/query/gpu.go @@ -0,0 +1,211 @@ +package query + +import ( + "errors" + "fmt" + "strconv" + "strings" + + "github.com/dustin/go-humanize" +) + +// ref. "nvidia-smi --help-query-gpu" +type GPU struct { + // The original GPU identifier from the nvidia-smi query output. + // e.g., "GPU 00000000:53:00.0" + ID string `json:"ID"` + + ProductName string `json:"Product Name"` + ProductBrand string `json:"Product Brand"` + ProductArchitecture string `json:"Product Architecture"` + + GPUResetStatus *GPUResetStatus `json:"GPU Reset Status,omitempty"` + ClockEventReasons *ClockEventReasons `json:"Clocks Event Reasons,omitempty"` + ECCErrors *ECCErrors `json:"ECC Errors,omitempty"` + Temperature *GPUTemperature `json:"Temperature,omitempty"` + GPUPowerReadings *GPUPowerReadings `json:"GPU Power Readings,omitempty"` + Processes *Processes `json:"Processes,omitempty"` + FBMemoryUsage *FBMemoryUsage `json:"FB Memory Usage"` +} + +type GPUResetStatus struct { + ResetRequired string `json:"Reset Required"` + DrainAndResetRecommended string `json:"Drain and Reset Recommended"` +} + +type ClockEventReasons struct { + SWPowerCap string `json:"SW Power Cap"` + SWThermalSlowdown string `json:"SW Thermal Slowdown"` + HWSlowdown string `json:"HW Slowdown"` + HWThermalSlowdown string `json:"HW Thermal Slowdown"` + HWPowerBrakeSlowdown string `json:"HW Power Brake Slowdown"` +} + +type ECCErrors struct { + Volatile *ECCErrorVolatile `json:"Volatile,omitempty"` + Aggregate *ECCErrorAggregate `json:"Aggregate,omitempty"` + AggregateUncorrectableSRAMSources *ECCErrorAggregateUncorrectableSRAMSources `json:"Aggregate Uncorrectable SRAM Sources,omitempty"` +} + +type ECCErrorVolatile struct { + SRAMCorrectable string `json:"SRAM Correctable"` + SRAMUncorrectable string `json:"SRAM Uncorrectable"` + SRAMUncorrectableParity string `json:"SRAM Uncorrectable Parity"` // for newer driver versions + SRAMUncorrectableSECDED string `json:"SRAM Uncorrectable SEC-DED"` // for newer driver versions + + DRAMCorrectable string `json:"DRAM Correctable"` + DRAMUncorrectable string `json:"DRAM Uncorrectable"` +} + +type ECCErrorAggregate struct { + SRAMCorrectable string `json:"SRAM Correctable"` + SRAMUncorrectable string `json:"SRAM Uncorrectable"` + SRAMUncorrectableParity string `json:"SRAM Uncorrectable Parity"` // for newer driver versions + SRAMUncorrectableSECDED string `json:"SRAM Uncorrectable SEC-DED"` // for newer driver versions + + DRAMCorrectable string `json:"DRAM Correctable"` + DRAMUncorrectable string `json:"DRAM Uncorrectable"` + + SRAMThresholdExceeded string `json:"SRAM Threshold Exceeded"` +} + +type ECCErrorAggregateUncorrectableSRAMSources struct { + SRAML2 string `json:"SRAM L2"` + SRAMSM string `json:"SRAM SM"` + SRAMMicrocontroller string `json:"SRAM Microcontroller"` + SRAMPCIE string `json:"SRAM PCIE"` + SRAMOther string `json:"SRAM Other"` +} + +// If any field shows "Unknown Error", it means GPU has some issues. +type GPUTemperature struct { + Current string `json:"GPU Current Temp"` + Limit string `json:"GPU T.Limit Temp"` + + ShutdownLimit string `json:"GPU Shutdown T.Limit Temp"` + SlowdownLimit string `json:"GPU Slowdown T.Limit Temp"` + MaxOperatingLimit string `json:"GPU Max Operating T.Limit Temp"` + + // this value is not reliable to monitor as it's often N/A + Target string `json:"GPU Target Temperature"` + + MemoryCurrent string `json:"Memory Current Temp"` + MemoryMaxOperatingLimit string `json:"Memory Max Operating T.Limit Temp"` +} + +func (tm *GPUTemperature) GetCurrentCelsius() (float64, error) { + v := tm.Current + if v == "N/A" { + return 0.0, errors.New("N/A") + } + + if strings.HasSuffix(v, " C") { + v = strings.TrimSuffix(v, " C") + } else { + return 0.0, fmt.Errorf("invalid GPU current temperature: %s (expected celsius)", tm.Current) + } + + parsed, err := strconv.ParseFloat(v, 64) + if err != nil { + return 0, err + } + return parsed, nil +} + +func (tm *GPUTemperature) GetLimitCelsius() (float64, error) { + v := tm.Limit + if v == "N/A" { + return 0.0, errors.New("N/A") + } + + if strings.HasSuffix(v, " C") { + v = strings.TrimSuffix(v, " C") + } else { + return 0.0, fmt.Errorf("invalid GPU t.limit temperature: %s (expected celsius)", tm.Limit) + } + + parsed, err := strconv.ParseFloat(v, 64) + if err != nil { + return 0, err + } + return parsed, nil +} + +type GPUPowerReadings struct { + PowerDraw string `json:"Power Draw"` + CurrentPowerLimit string `json:"Current Power Limit"` + RequestedPowerLimit string `json:"Requested Power Limit"` + DefaultPowerLimit string `json:"Default Power Limit"` + MinPowerLimit string `json:"Min Power Limit"` + MaxPowerLimit string `json:"Max Power Limit"` +} + +func (g *GPUPowerReadings) GetPowerDrawW() (float64, error) { + v := g.PowerDraw + if v == "N/A" { + return 0.0, errors.New("N/A") + } + + if strings.HasSuffix(v, " W") { + v = strings.TrimSuffix(v, " W") + } else { + return 0.0, fmt.Errorf("invalid power draw: %s (expected watts)", g.PowerDraw) + } + + parsed, err := strconv.ParseFloat(v, 64) + if err != nil { + return 0, err + } + return parsed, nil +} + +func (g *GPUPowerReadings) GetCurrentPowerLimitW() (float64, error) { + v := g.CurrentPowerLimit + if v == "N/A" { + return 0.0, errors.New("N/A") + } + + if strings.HasSuffix(v, " W") { + v = strings.TrimSuffix(v, " W") + } else { + return 0.0, fmt.Errorf("invalid current power limit: %s (expected watts)", g.CurrentPowerLimit) + } + + parsed, err := strconv.ParseFloat(v, 64) + if err != nil { + return 0, err + } + return parsed, nil +} + +type Processes struct { + GPUInstanceID string `json:"GPU instance ID"` + ComputeInstanceID string `json:"Compute instance ID"` + ProcessID int64 `json:"Process ID"` + ProcessType string `json:"Process Type"` + ProcessName string `json:"Process Name"` + ProcessUsedGPUMemory string `json:"Process Used GPU Memory"` +} + +type FBMemoryUsage struct { + Total string `json:"Total"` + Reserved string `json:"Reserved"` + Used string `json:"Used"` + Free string `json:"Free"` +} + +func (f *FBMemoryUsage) GetTotalBytes() (uint64, error) { + return humanize.ParseBytes(f.Total) +} + +func (f *FBMemoryUsage) GetReservedBytes() (uint64, error) { + return humanize.ParseBytes(f.Reserved) +} + +func (f *FBMemoryUsage) GetUsedBytes() (uint64, error) { + return humanize.ParseBytes(f.Used) +} + +func (f *FBMemoryUsage) GetFreeBytes() (uint64, error) { + return humanize.ParseBytes(f.Free) +} diff --git a/components/accelerator/nvidia/query/gpu_test.go b/components/accelerator/nvidia/query/gpu_test.go new file mode 100644 index 00000000..3cf54528 --- /dev/null +++ b/components/accelerator/nvidia/query/gpu_test.go @@ -0,0 +1,88 @@ +package query + +import "testing" + +func TestTemperature(t *testing.T) { + tm := &GPUTemperature{ + Current: "38 C", + Limit: "47 C", + } + + currentTempC, err := tm.GetCurrentCelsius() + if err != nil { + t.Fatalf("error getting current temperature: %v", err) + } + if currentTempC != 38.0 { + t.Fatalf("expected current temperature of 38.0, got %f", currentTempC) + } + + currentLimitTempC, err := tm.GetLimitCelsius() + if err != nil { + t.Fatalf("error getting current limit temperature: %v", err) + } + if currentLimitTempC != 47.0 { + t.Fatalf("expected current limit temperature of 47.0, got %f", currentLimitTempC) + } +} + +func TestGPUPowerReadings(t *testing.T) { + g := &GPUPowerReadings{ + PowerDraw: "71.97 W", + CurrentPowerLimit: "700.00 W", + } + powerDrawW, err := g.GetPowerDrawW() + if err != nil { + t.Fatalf("error getting power draw: %v", err) + } + if powerDrawW != 71.97 { + t.Fatalf("expected power draw of 71.97, got %f", powerDrawW) + } + currentPowerLimitW, err := g.GetCurrentPowerLimitW() + if err != nil { + t.Fatalf("error getting current power limit: %v", err) + } + if currentPowerLimitW != 700 { + t.Fatalf("expected current power limit of 700, got %f", currentPowerLimitW) + } +} + +func TestFBMemoryUsage(t *testing.T) { + f := &FBMemoryUsage{ + Total: "81559 MiB", + Reserved: "551 MiB", + Used: "0 MiB", + Free: "81007 MiB", + } + + totalBytes, err := f.GetTotalBytes() + if err != nil { + t.Fatalf("error getting total bytes: %v", err) + } + if totalBytes != 85520809984 { + t.Fatalf("expected total bytes of 85520809984, got %d", totalBytes) + } + + reservedBytes, err := f.GetReservedBytes() + if err != nil { + t.Fatalf("error getting reserved bytes: %v", err) + } + if reservedBytes != 577765376 { + t.Fatalf("expected reserved bytes of 577765376, got %d", reservedBytes) + } + + usedBytes, err := f.GetUsedBytes() + if err != nil { + t.Fatalf("error getting used bytes: %v", err) + } + if usedBytes != 0 { + t.Fatalf("expected used bytes of 0, got %d", usedBytes) + } + + freeBytes, err := f.GetFreeBytes() + if err != nil { + t.Fatalf("error getting free bytes: %v", err) + } + if freeBytes != 84941996032 { + t.Fatalf("expected free bytes of 84941996032, got %d", freeBytes) + } +} diff --git a/components/accelerator/nvidia/query/querier.go b/components/accelerator/nvidia/query/querier.go new file mode 100644 index 00000000..bd2bacfd --- /dev/null +++ b/components/accelerator/nvidia/query/querier.go @@ -0,0 +1,128 @@ +package query + +import ( + "context" + "encoding/json" + "sync" + "time" + + "github.com/leptonai/leptond/log" +) + +var ( + DefaultQuerier = NewQuerier() +) + +type QuerierConfig struct { + Interval time.Duration `json:"interval"` +} + +func ParseQuerierConfig(b any) (*QuerierConfig, error) { + raw, err := json.Marshal(b) + if err != nil { + return nil, err + } + + cfg := new(QuerierConfig) + err = json.Unmarshal(raw, cfg) + if err != nil { + return nil, err + } + return cfg, nil +} + +// Defines the nvidia-smi querier interface +// so many components can share the same query results. +type Querier interface { + // Starts the querier routine. + // Redundant calls will be skipped if there's an existing querier. + Start(ctx context.Context, interval time.Duration) + // Stops the querier routine. + // Safe to call multiple times. + Stop() + + // Last returns the last result + Last() Result + // All returns all results + All() []Result +} + +const defaultQueueSize = 20 + +func NewQuerier() Querier { + return &querier{ + queueN: defaultQueueSize, + } +} + +type querier struct { + mu sync.Mutex + ctx context.Context + cancel context.CancelFunc + + queueN int + last []Result + lastMu sync.RWMutex +} + +func (q *querier) Start(ctx context.Context, interval time.Duration) { + q.mu.Lock() + defer q.mu.Unlock() + + started := q.ctx != nil + if started { + log.Logger.Warnw("querier already started") + return + } + + log.Logger.Infow("starting querier") + q.ctx, q.cancel = context.WithCancel(ctx) + ch := start(q.ctx, interval) + go func() { + for result := range ch { + q.addResult(result) + } + }() +} + +func (q *querier) addResult(result Result) { + q.lastMu.Lock() + defer q.lastMu.Unlock() + + if len(q.last) >= q.queueN { + q.last = q.last[1:] + } + q.last = append(q.last, result) +} + +func (q *querier) Stop() { + q.mu.Lock() + defer q.mu.Unlock() + + stopped := q.ctx == nil + if stopped { + log.Logger.Warnw("querier already stopped") + return + } + + log.Logger.Infow("stopping querier") + q.cancel() + q.ctx = nil + q.cancel = nil +} + +func (q *querier) Last() Result { + q.lastMu.RLock() + defer q.lastMu.RUnlock() + + if len(q.last) == 0 { + return Result{} + } + return q.last[len(q.last)-1] +} + +func (q *querier) All() []Result { + q.lastMu.RLock() + defer q.lastMu.RUnlock() + return q.last +} diff --git a/components/accelerator/nvidia/query/querier_test.go b/components/accelerator/nvidia/query/querier_test.go new file mode 100644 index 00000000..a06d698e --- /dev/null +++ b/components/accelerator/nvidia/query/querier_test.go @@ -0,0 +1,72 @@ +package query + +import ( + "reflect" + "testing" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestQuerier_addResult(t *testing.T) { + now := time.Now() + + tests := []struct { + name string + queueN int + initial []Result + newResult Result + expected []Result + }{ + { + name: "Add to empty queue", + queueN: 3, + initial: []Result{}, + newResult: Result{Time: metav1.NewTime(now)}, + expected: []Result{{Time: metav1.NewTime(now)}}, + }, + { + name: "Add to non-full queue", + queueN: 3, + initial: []Result{ + {Time: metav1.NewTime(now.Add(-2 * time.Second))}, + }, + newResult: Result{Time: metav1.NewTime(now)}, + expected: []Result{ + {Time: metav1.NewTime(now.Add(-2 * time.Second))}, + {Time: metav1.NewTime(now)}, + }, + }, + { + name: "Add to full queue", + queueN: 3, + initial: []Result{ + {Time: metav1.NewTime(now.Add(-3 * time.Second))}, + {Time: metav1.NewTime(now.Add(-2 * time.Second))}, + {Time: metav1.NewTime(now.Add(-1 * time.Second))}, + }, + newResult: Result{Time: metav1.NewTime(now)}, + expected: []Result{ + {Time: metav1.NewTime(now.Add(-2 * time.Second))}, + {Time: metav1.NewTime(now.Add(-1 * time.Second))}, + {Time: metav1.NewTime(now)}, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + q := &querier{ + queueN: tt.queueN, + last: tt.initial, + } + q.addResult(tt.newResult) + if !reflect.DeepEqual(tt.expected, q.last) { + t.Errorf("expected %+v, got %+v", tt.expected, q.last) + } + if len(q.last) > tt.queueN { + t.Errorf("expected queue length of %d, got %d", tt.queueN, len(q.last)) + } + }) + } +} diff --git a/components/accelerator/nvidia/query/query.go b/components/accelerator/nvidia/query/query.go new file mode 100644 index 00000000..49e38df1 --- /dev/null +++ b/components/accelerator/nvidia/query/query.go @@ -0,0 +1,348 @@ +// Package query implements "nvidia-smi --query" output helpers. +package query + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "os/exec" + "strings" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/yaml" +) + +type Result struct { + Time metav1.Time `json:"time"` + Output *Output `json:"output,omitempty"` + Error error `json:"error,omitempty"` +} + +func start(ctx context.Context, interval time.Duration) <-chan Result { + ch := make(chan Result, 1) + go func() { + defer close(ch) + + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + select { + case ch <- Result{ + Time: metav1.Time{Time: time.Now().UTC()}, + Error: ctx.Err(), + }: + default: // channel is full, skip this result and continue + } + return + + case <-ticker.C: + summaryBytes, err := Run(ctx) + if err != nil { + select { + case <-ctx.Done(): + return + case ch <- Result{ + Time: metav1.Time{Time: time.Now().UTC()}, + Error: err, + }: + default: // channel is full, skip this result and continue + } + continue + } + + queryBytes, err := Run(ctx, "--query") + if err != nil { + select { + case <-ctx.Done(): + return + case ch <- Result{ + Time: metav1.Time{Time: time.Now().UTC()}, + Error: err, + }: + default: // channel is full, skip this result and continue + } + continue + } + + output, err := Parse(queryBytes) + if err != nil { + select { + case <-ctx.Done(): + return + case ch <- Result{ + Time: metav1.Time{Time: time.Now().UTC()}, + Error: err, + }: + default: // channel is full, skip this result and continue + } + continue + } + output.Summary = string(summaryBytes) + + select { + case <-ctx.Done(): + return + case ch <- Result{ + Time: metav1.Time{Time: time.Now().UTC()}, + Output: output, + }: + default: // channel is full, skip this result and continue + } + } + } + }() + return ch +} + +func Run(ctx context.Context, args ...string) ([]byte, error) { + p, err := exec.LookPath("nvidia-smi") + if err != nil { + return nil, fmt.Errorf("nvidia-smi not found (%w)", err) + } + + ctx, cancel := context.WithTimeout(ctx, time.Minute) + defer cancel() + + cmd := exec.CommandContext(ctx, p, args...) + return cmd.Output() +} + +// Decodes the "nvidia-smi --query" output. +// ref. https://developer.nvidia.com/system-management-interface +func Parse(b []byte) (*Output, error) { + splits := bytes.Split(b, []byte("\n")) + processedLines := make([][]byte, 0, len(splits)) + + // tracks the last line to its indent level + lastIndent := 0 + gpuCursor := 0 + prevGPUID := "" + + for _, currentLine := range splits { + if len(currentLine) == 0 { + continue + } + if bytes.Contains(currentLine, []byte("===")) || bytes.Contains(currentLine, []byte("NVSMI LOG")) { + continue + } + + lastLine := []byte{} + if len(processedLines) > 0 { + lastLine = processedLines[len(processedLines)-1] + } + + indentLevel := len(currentLine) - len(bytes.TrimSpace(currentLine)) + + gpuIDLine := "" + if prevGPUID != "" { + gpuIDLine = strings.Repeat(" ", indentLevel) + "ID: " + prevGPUID + prevGPUID = "" + } + + lastKey := getKey(lastLine) + switch { + case bytes.HasPrefix(currentLine, []byte("GPU 00000")): + // e.g., + // + // GPU 00000000:53:00.0 + // + // should be converted to + // + // GPU0 + + prevGPUID = string(currentLine) + currentLine = []byte(fmt.Sprintf("GPU%d:", gpuCursor)) + gpuCursor++ + + case !bytes.HasSuffix(currentLine, []byte(":")) && !bytes.Contains(currentLine, []byte(":")): + // e.g., + // + // Driver Model + // Current : N/A + // + // should be + // + // Driver Model: + // Current : N/A + + currentLine = append(currentLine, ':') + + case bytes.HasSuffix(bytes.TrimSpace(currentLine), []byte("None")): + // e.g., + // + // Processes : None + // + // should be + // + // Processes : null + currentLine = bytes.Replace(currentLine, []byte("None"), []byte("null"), 1) + + case bytes.HasPrefix(lastKey, []byte("HW Slowdown")) || + bytes.HasPrefix(lastKey, []byte("HW Thermal Slowdown")) || + bytes.HasPrefix(lastKey, []byte("Process ID")) || + bytes.HasPrefix(lastKey, []byte("Process Type")) || + bytes.HasPrefix(lastKey, []byte("Process Name")): + // e.g., + // + // HW Slowdown : Not Active + // HW Thermal Slowdown : Not Active + // + // should be + // + // HW Slowdown : Not Active + // HW Thermal Slowdown : Not Active + + // e.g., + // + // Process ID : 1375347 + // Type : C + // Name : /usr/bin/python + // Used GPU Memory : 22372 MiB + // + // should be + // + // Process ID : 1375347 + // Process Type : C + // Process Name : /usr/bin/python + // Process Used GPU Memory : 22372 MiB + trimmed := bytes.TrimSpace(currentLine) + currentLine = bytes.Repeat([]byte(" "), lastIndent) + if bytes.HasPrefix(lastKey, []byte("Process ID")) || + bytes.HasPrefix(lastKey, []byte("Process Type")) || + bytes.HasPrefix(lastKey, []byte("Process Name")) { + currentLine = append(currentLine, []byte("Process ")...) + } + currentLine = append(currentLine, trimmed...) + } + + if gpuIDLine != "" { + processedLines = append(processedLines, []byte(gpuIDLine)) + } + + processedLines = append(processedLines, currentLine) + lastIndent = len(currentLine) - len(bytes.TrimSpace(currentLine)) + } + + processedOutput := bytes.Join(processedLines, []byte("\n")) + + raw := &rawOutput{} + if err := yaml.Unmarshal(processedOutput, raw); err != nil { + // in case nvidia-smi introduced some breaking changes + // retry with a fallback implementation + // and to retain debugging info such as driver version + fallback := &outputFallback{} + newOutput := bytes.Split(processedOutput, []byte("\nGPU"))[0] + if rerr := yaml.Unmarshal(newOutput, fallback); rerr != nil { + return nil, rerr + } + return &Output{ + Timestamp: fallback.Timestamp, + DriverVersion: fallback.DriverVersion, + CUDAVersion: fallback.CUDAVersion, + AttachedGPUs: fallback.AttachedGPUs, + }, err + } + + out := &Output{ + Timestamp: raw.Timestamp, + DriverVersion: raw.DriverVersion, + CUDAVersion: raw.CUDAVersion, + AttachedGPUs: raw.AttachedGPUs, + Raw: string(b), + } + gpuFields := []*GPU{raw.GPU0, raw.GPU1, raw.GPU2, raw.GPU3, raw.GPU4, raw.GPU5, raw.GPU6, raw.GPU7} + for _, gpu := range gpuFields { + if gpu != nil { + out.GPUs = append(out.GPUs, *gpu) + } + } + return out, nil +} + +// ref. "nvidia-smi --help-query-gpu" +type rawOutput struct { + Timestamp string `json:"Timestamp"` + DriverVersion string `json:"Driver Version"` + CUDAVersion string `json:"CUDA Version"` + AttachedGPUs int `json:"Attached GPUs"` + + GPU0 *GPU `json:"GPU0,omitempty"` + GPU1 *GPU `json:"GPU1,omitempty"` + GPU2 *GPU `json:"GPU2,omitempty"` + GPU3 *GPU `json:"GPU3,omitempty"` + GPU4 *GPU `json:"GPU4,omitempty"` + GPU5 *GPU `json:"GPU5,omitempty"` + GPU6 *GPU `json:"GPU6,omitempty"` + GPU7 *GPU `json:"GPU7,omitempty"` +} + +// ref. "nvidia-smi --help-query-gpu" +type Output struct { + Timestamp string `json:"timestamp"` + DriverVersion string `json:"driver_version"` + CUDAVersion string `json:"cuda_version"` + AttachedGPUs int `json:"attached_gpus"` + + GPUs []GPU `json:"gpus,omitempty"` + + // Raw is the raw output of "nvidia-smi --query". + // Useful for debugging. + Raw string `json:"raw"` + + // Summary is the "nvidia-smi" output without "--query" flag. + // Useful for error detecting, in case the new nvidia-smi + // version introduces breaking changes to its query output. + Summary string `json:"summary"` +} + +func (o *Output) JSON() ([]byte, error) { + return json.Marshal(o) +} + +func (o *Output) YAML() ([]byte, error) { + return yaml.Marshal(o) +} + +func getKey(line []byte) []byte { + k := bytes.Split(line, []byte(":"))[0] + return bytes.TrimSpace(k) +} + +type outputFallback struct { + Timestamp string `json:"Timestamp"` + DriverVersion string `json:"Driver Version"` + CUDAVersion string `json:"CUDA Version"` + AttachedGPUs int `json:"Attached GPUs"` +} + +const ( + ClockEventsActive = "Active" + ClockEventsNotActive = "Not Active" +) + +// Returns the detail HW Slowdown message and "true" if any of the GPU has "Active" HW Slowdown event. +func (o *Output) HasHWSlowdown() (string, bool) { + errs := make([]string, 0) + for _, gpu := range o.GPUs { + if gpu.ClockEventReasons == nil { + continue + } + if gpu.ClockEventReasons.HWSlowdown != ClockEventsActive { + continue + } + if gpu.ClockEventReasons.HWThermalSlowdown == ClockEventsActive { + errs = append(errs, fmt.Sprintf("%s: ClockEventReasons.HWSlowdown.ThermalSlowdown %s", gpu.ID, ClockEventsActive)) + } + if gpu.ClockEventReasons.HWPowerBrakeSlowdown == ClockEventsActive { + errs = append(errs, fmt.Sprintf("%s: ClockEventReasons.HWSlowdown.PowerBrakeSlowdown %s", gpu.ID, ClockEventsActive)) + } + } + if len(errs) == 0 { + return "", false + } + return strings.Join(errs, ", "), true +} diff --git a/components/accelerator/nvidia/query/query_test.go b/components/accelerator/nvidia/query/query_test.go new file mode 100644 index 00000000..62c3c3d1 --- /dev/null +++ b/components/accelerator/nvidia/query/query_test.go @@ -0,0 +1,275 @@ +package query + +import ( + "os" + "path/filepath" + "testing" +) + +func TestParseWithProcesses(t *testing.T) { + data, err := os.ReadFile("testdata/nvidia-smi-query.535.154.05.out.0.valid") + if err != nil { + t.Fatalf("failed to read file: %v", err) + } + parsed, err := Parse(data) + if err != nil { + t.Errorf("Parse returned an error: %v", err) + } + + if parsed.GPUs[0].ID != "GPU 00000000:01:00.0" { + t.Errorf("GPU0.ID mismatch: %+v", parsed.GPUs[0].ID) + } + if parsed.GPUs[0].ClockEventReasons.HWThermalSlowdown != ClockEventsNotActive { + t.Errorf("HWThermalSlowdown mismatch: %+v", parsed.GPUs[0].ClockEventReasons.HWThermalSlowdown) + } + + if parsed.GPUs[7].Processes.ProcessID != 1102861 { + t.Errorf("ProcessID mismatch: %d", parsed.GPUs[7].Processes.ProcessID) + } + if parsed.GPUs[7].Processes.ProcessName != "/opt/lepton/venv/bin/python3.10" { + t.Errorf("ProcessName mismatch: %s", parsed.GPUs[7].Processes.ProcessName) + } + + yb, err := parsed.YAML() + if err != nil { + t.Errorf("YAML returned an error: %v", err) + } + t.Logf("YAML:\n%s\n", yb) +} + +func TestParseWithNoProcesses(t *testing.T) { + data, err := os.ReadFile("testdata/nvidia-smi-query.535.183.01.out.0.valid") + if err != nil { + t.Fatalf("failed to read file: %v", err) + } + parsed, err := Parse(data) + if err != nil { + t.Errorf("Parse returned an error: %v", err) + } + + if parsed.GPUs[0].ID != "GPU 00000000:53:00.0" { + t.Errorf("GPU0.ID mismatch: %+v", parsed.GPUs[0].ID) + } + if parsed.GPUs[0].ClockEventReasons.HWThermalSlowdown != ClockEventsNotActive { + t.Errorf("HWThermalSlowdown mismatch: %+v", parsed.GPUs[0].ClockEventReasons.HWThermalSlowdown) + } + if parsed.GPUs[0].Temperature.Current != "36 C" { + t.Errorf("GPU0.Temperature.GPUCurrentTemp mismatch: %+v", parsed.GPUs[0].Temperature.Current) + } + if parsed.GPUs[0].GPUPowerReadings.PowerDraw != "71.97 W" { + t.Errorf("PowerDraw mismatch: %+v", parsed.GPUs[0].GPUPowerReadings.PowerDraw) + } + if parsed.GPUs[0].GPUPowerReadings.CurrentPowerLimit != "700.00 W" { + t.Errorf("CurrentPowerLimit mismatch: %+v", parsed.GPUs[0].GPUPowerReadings.CurrentPowerLimit) + } + if parsed.GPUs[0].ECCErrors.Volatile.SRAMCorrectable != "0" { + t.Errorf("GPU0.ECCErrors.Volatile.SRAMCorrectable mismatch: %+v", parsed.GPUs[0].ECCErrors.Volatile.SRAMCorrectable) + } + if parsed.GPUs[0].FBMemoryUsage.Total != "81559 MiB" { + t.Errorf("GPU0.FBMemoryUsage.Total mismatch: %+v", parsed.GPUs[0].FBMemoryUsage.Total) + } + if parsed.GPUs[0].FBMemoryUsage.Reserved != "551 MiB" { + t.Errorf("GPU0.FBMemoryUsage.Reserved mismatch: %+v", parsed.GPUs[0].FBMemoryUsage.Reserved) + } + + if parsed.GPUs[1].ID != "GPU 00000000:64:00.0" { + t.Errorf("GPU1.ID mismatch: %+v", parsed.GPUs[1].ID) + } + if parsed.GPUs[1].ClockEventReasons.HWThermalSlowdown != ClockEventsNotActive { + t.Errorf("HWThermalSlowdown mismatch: %+v", parsed.GPUs[1].ClockEventReasons.HWThermalSlowdown) + } + + if parsed.GPUs[2].ID != "GPU 00000000:75:00.0" { + t.Errorf("GPU2.ID mismatch: %+v", parsed.GPUs[2].ID) + } + if parsed.GPUs[2].ClockEventReasons.SWPowerCap != ClockEventsActive { + t.Errorf("SWPowerCap mismatch: %+v", parsed.GPUs[2].ClockEventReasons.SWPowerCap) + } + if parsed.GPUs[2].ClockEventReasons.SWThermalSlowdown != ClockEventsActive { + t.Errorf("SWThermalSlowdown mismatch: %+v", parsed.GPUs[2].ClockEventReasons.SWThermalSlowdown) + } + if parsed.GPUs[2].ClockEventReasons.HWThermalSlowdown != ClockEventsNotActive { + t.Errorf("HWThermalSlowdown mismatch: %+v", parsed.GPUs[2].ClockEventReasons.HWThermalSlowdown) + } + + if parsed.GPUs[3].ID != "GPU 00000000:86:00.0" { + t.Errorf("GPU3.ID mismatch: %+v", parsed.GPUs[3].ID) + } + if parsed.GPUs[3].ClockEventReasons.HWThermalSlowdown != ClockEventsNotActive { + t.Errorf("HWThermalSlowdown mismatch: %+v", parsed.GPUs[3].ClockEventReasons.HWThermalSlowdown) + } + + if parsed.GPUs[4].ID != "GPU 00000000:97:00.0" { + t.Errorf("GPU4.ID mismatch: %+v", parsed.GPUs[4].ID) + } + if parsed.GPUs[4].ClockEventReasons.HWThermalSlowdown != ClockEventsNotActive { + t.Errorf("HWThermalSlowdown mismatch: %+v", parsed.GPUs[4].ClockEventReasons.HWThermalSlowdown) + } + + if parsed.GPUs[5].ID != "GPU 00000000:A8:00.0" { + t.Errorf("GPU5.ID mismatch: %+v", parsed.GPUs[5].ID) + } + if parsed.GPUs[5].ClockEventReasons.HWThermalSlowdown != ClockEventsNotActive { + t.Errorf("HWThermalSlowdown mismatch: %+v", parsed.GPUs[5].ClockEventReasons.HWThermalSlowdown) + } + + if parsed.GPUs[6].ID != "GPU 00000000:B9:00.0" { + t.Errorf("GPU6.ID mismatch: %+v", parsed.GPUs[6].ID) + } + if parsed.GPUs[6].ClockEventReasons.HWThermalSlowdown != ClockEventsNotActive { + t.Errorf("HWThermalSlowdown mismatch: %+v", parsed.GPUs[6].ClockEventReasons.HWThermalSlowdown) + } + + if parsed.GPUs[7].ID != "GPU 00000000:CA:00.0" { + t.Errorf("GPU7.ID mismatch: %+v", parsed.GPUs[7].ID) + } + if parsed.GPUs[7].ClockEventReasons.HWThermalSlowdown != ClockEventsNotActive { + t.Errorf("HWThermalSlowdown mismatch: %+v", parsed.GPUs[7].ClockEventReasons.HWThermalSlowdown) + } + + yb, err := parsed.YAML() + if err != nil { + t.Errorf("YAML returned an error: %v", err) + } + t.Logf("YAML:\n%s\n", yb) +} + +func TestParseWithFallback(t *testing.T) { + data, err := os.ReadFile("testdata/nvidia-smi-query.535.183.01.out.0.invalid") + if err != nil { + t.Fatalf("failed to read file: %v", err) + } + + parsed, err := Parse(data) + if err == nil { + t.Errorf("Parse returned no error") + } + if parsed.CUDAVersion != "12.2" { + t.Errorf("CUDAVersion mismatch: %+v", parsed.CUDAVersion) + } +} + +func TestParseMore(t *testing.T) { + matches, err := filepath.Glob("testdata/nvidia-smi-query.*.out.*.valid") + if err != nil { + t.Fatalf("failed to glob: %v", err) + } + for _, f := range matches { + data, err := os.ReadFile(f) + if err != nil { + t.Fatalf("failed to read file: %v", err) + } + if _, err := Parse(data); err != nil { + t.Errorf("Parse returned an error: %v", err) + } + } +} + +func TestHasHWSlowdown(t *testing.T) { + tests := []struct { + name string + output *Output + wantMsg string + wantBool bool + }{ + { + name: "No HW Slowdown", + output: &Output{ + GPUs: []GPU{ + { + ClockEventReasons: &ClockEventReasons{ + HWSlowdown: ClockEventsActive, + HWThermalSlowdown: ClockEventsNotActive, + HWPowerBrakeSlowdown: ClockEventsNotActive, + }, + }, + }, + }, + wantMsg: "", + wantBool: false, + }, + { + name: "Thermal Slowdown on GPU0", + output: &Output{ + GPUs: []GPU{ + { + ID: "gpu0", + ClockEventReasons: &ClockEventReasons{ + HWSlowdown: ClockEventsActive, + HWThermalSlowdown: ClockEventsActive, + HWPowerBrakeSlowdown: ClockEventsNotActive, + }, + }, + }, + }, + wantMsg: "gpu0: ClockEventReasons.HWSlowdown.ThermalSlowdown Active", + wantBool: true, + }, + { + name: "Power Brake Slowdown on GPU1", + output: &Output{ + GPUs: []GPU{ + { + ID: "gpu0", + ClockEventReasons: &ClockEventReasons{ + HWSlowdown: ClockEventsActive, + HWThermalSlowdown: ClockEventsNotActive, + HWPowerBrakeSlowdown: ClockEventsActive, + }, + }, + }, + }, + wantMsg: "gpu0: ClockEventReasons.HWSlowdown.PowerBrakeSlowdown Active", + wantBool: true, + }, + { + name: "Multiple GPUs with Slowdowns", + output: &Output{ + GPUs: []GPU{ + { + ID: "gpu0", + ClockEventReasons: &ClockEventReasons{ + HWSlowdown: ClockEventsActive, + HWThermalSlowdown: ClockEventsActive, + HWPowerBrakeSlowdown: ClockEventsNotActive, + }, + }, + { + ID: "gpu1", + ClockEventReasons: &ClockEventReasons{ + HWSlowdown: ClockEventsActive, + HWThermalSlowdown: ClockEventsNotActive, + HWPowerBrakeSlowdown: ClockEventsActive, + }, + }, + }, + }, + wantMsg: "gpu0: ClockEventReasons.HWSlowdown.ThermalSlowdown Active, gpu1: ClockEventReasons.HWSlowdown.PowerBrakeSlowdown Active", + wantBool: true, + }, + { + name: "Nil HWSlowdown", + output: &Output{ + GPUs: []GPU{ + { + ClockEventReasons: &ClockEventReasons{}, + }, + }, + }, + wantMsg: "", + wantBool: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotMsg, gotBool := tt.output.HasHWSlowdown() + if gotMsg != tt.wantMsg { + t.Errorf("Output.HasHWSlowdown() gotMsg = %v, want %v", gotMsg, tt.wantMsg) + } + if gotBool != tt.wantBool { + t.Errorf("Output.HasHWSlowdown() gotBool = %v, want %v", gotBool, tt.wantBool) + } + }) + } +} diff --git a/components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.154.05.out.0.valid b/components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.154.05.out.0.valid new file mode 100644 index 00000000..384008f8 --- /dev/null +++ b/components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.154.05.out.0.valid @@ -0,0 +1,1608 @@ +==============NVSMI LOG============== + +Timestamp : Sat Jul 6 15:09:26 2024 +Driver Version : 535.154.05 +CUDA Version : 12.2 + +Attached GPUs : 8 +GPU 00000000:01:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-313bbff0-b0a0-fd26-4820-0578bdef3a12 + Minor Number : 3 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0x100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x01 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:01:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 4 + Device Current : 4 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 38 % + Performance State : P2 + Clocks Event Reasons + Idle : Not Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 20328 MiB + Free : 3888 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 4 MiB + Free : 252 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 41 C + GPU T.Limit Temp : 42 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 67.76 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 2520 MHz + SM : 2520 MHz + Memory : 10251 MHz + Video : 1965 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 970.000 mV + Fabric + State : N/A + Status : N/A + Processes + GPU instance ID : N/A + Compute instance ID : N/A + Process ID : 58130 + Type : C + Name : /usr/bin/python + Used GPU Memory : 20322 MiB + +GPU 00000000:23:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-1a287db8-89a4-4cd7-5933-ae53d0c4d4ad + Minor Number : 2 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0x2300 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x23 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:23:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 4 + Device Current : 4 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 1000 KB/s + Rx Throughput : 10000 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 40 % + Performance State : P2 + Clocks Event Reasons + Idle : Not Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 21356 MiB + Free : 2860 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 4 MiB + Free : 252 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 100 % + Memory : 53 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 58 C + GPU T.Limit Temp : 25 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 157.70 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 2775 MHz + SM : 2775 MHz + Memory : 10251 MHz + Video : 2145 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 1050.000 mV + Fabric + State : N/A + Status : N/A + Processes + GPU instance ID : N/A + Compute instance ID : N/A + Process ID : 2930819 + Type : C + Name : /usr/bin/python + Used GPU Memory : 21350 MiB + +GPU 00000000:41:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-a4f36154-6f3f-303f-e098-dec1632fd43f + Minor Number : 1 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0x4100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x41 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:41:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 4 + Device Current : 4 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 40 % + Performance State : P2 + Clocks Event Reasons + Idle : Not Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 23044 MiB + Free : 1172 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 4 MiB + Free : 252 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 41 C + GPU T.Limit Temp : 43 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 59.04 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 2520 MHz + SM : 2520 MHz + Memory : 10251 MHz + Video : 1965 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 945.000 mV + Fabric + State : N/A + Status : N/A + Processes + GPU instance ID : N/A + Compute instance ID : N/A + Process ID : 2146842 + Type : C + Name : /usr/bin/python + Used GPU Memory : 23038 MiB + +GPU 00000000:61:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-025f6884-1162-3032-9aab-14e0d5622783 + Minor Number : 0 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0x6100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x61 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:61:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 4 + Device Current : 4 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 35 % + Performance State : P2 + Clocks Event Reasons + Idle : Not Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 22790 MiB + Free : 1426 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 4 MiB + Free : 252 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 36 C + GPU T.Limit Temp : 47 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 57.06 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 2520 MHz + SM : 2520 MHz + Memory : 10251 MHz + Video : 2010 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 935.000 mV + Fabric + State : N/A + Status : N/A + Processes + GPU instance ID : N/A + Compute instance ID : N/A + Process ID : 3159481 + Type : C + Name : /usr/bin/python + Used GPU Memory : 22784 MiB + +GPU 00000000:81:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-751d1667-9d35-b4ef-90cf-a7bdff6aa964 + Minor Number : 7 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0x8100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x81 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:81:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 1 + Device Current : 1 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 35 % + Performance State : P8 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 24212 MiB + Free : 4 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 4 MiB + Free : 252 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 32 C + GPU T.Limit Temp : 52 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 20.71 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 210 MHz + SM : 210 MHz + Memory : 405 MHz + Video : 1185 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 890.000 mV + Fabric + State : N/A + Status : N/A + Processes + GPU instance ID : N/A + Compute instance ID : N/A + Process ID : 1100070 + Type : C + Name : /opt/lepton/venv/bin/python3.10 + Used GPU Memory : 24206 MiB + +GPU 00000000:A1:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-63cdf48e-0e17-dc82-873d-e0af7f380d0b + Minor Number : 6 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0xa100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0xA1 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:A1:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 1 + Device Current : 1 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 34 % + Performance State : P8 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 0 MiB + Free : 24217 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 1 MiB + Free : 255 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 31 C + GPU T.Limit Temp : 52 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 17.62 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 210 MHz + SM : 210 MHz + Memory : 405 MHz + Video : 1185 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 890.000 mV + Fabric + State : N/A + Status : N/A + Processes : None + +GPU 00000000:C1:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-52688fe6-e746-6dec-48ea-96cd464a0db1 + Minor Number : 5 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0xc100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0xC1 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:C1:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 4 + Device Current : 4 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 8000 KB/s + Rx Throughput : 49000 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 74 % + Performance State : P2 + Clocks Event Reasons + Idle : Not Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 22378 MiB + Free : 1838 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 4 MiB + Free : 252 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 91 % + Memory : 56 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 62 C + GPU T.Limit Temp : 16 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 313.42 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 2685 MHz + SM : 2685 MHz + Memory : 10251 MHz + Video : 2070 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 1050.000 mV + Fabric + State : N/A + Status : N/A + Processes + GPU instance ID : N/A + Compute instance ID : N/A + Process ID : 1375347 + Type : C + Name : /usr/bin/python + Used GPU Memory : 22372 MiB + +GPU 00000000:E1:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-94f6cd0f-cb66-91a9-a3b0-7253e9f9a0ba + Minor Number : 4 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0xe100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0xE1 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:E1:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 1 + Device Current : 1 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 33 % + Performance State : P8 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 6488 MiB + Free : 17728 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 4 MiB + Free : 252 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 32 C + GPU T.Limit Temp : 52 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 25.91 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 210 MHz + SM : 210 MHz + Memory : 405 MHz + Video : 1185 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 885.000 mV + Fabric + State : N/A + Status : N/A + Processes + GPU instance ID : N/A + Compute instance ID : N/A + Process ID : 1102861 + Type : C + Name : /opt/lepton/venv/bin/python3.10 + Used GPU Memory : 6482 MiB diff --git a/components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.154.05.out.1.valid b/components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.154.05.out.1.valid new file mode 100644 index 00000000..dab308af --- /dev/null +++ b/components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.154.05.out.1.valid @@ -0,0 +1,1580 @@ + +==============NVSMI LOG============== + +Timestamp : Mon Jul 8 08:11:21 2024 +Driver Version : 535.154.05 +CUDA Version : 12.2 + +Attached GPUs : 8 +GPU 00000000:01:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-370465f5-30e0-6877-b85c-16729f41b90f + Minor Number : 3 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0x100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x01 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:01:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 1 + Device Current : 1 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 0 % + Performance State : P8 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 0 MiB + Free : 24217 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 1 MiB + Free : 255 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 30 C + GPU T.Limit Temp : 53 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 18.90 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 210 MHz + SM : 210 MHz + Memory : 405 MHz + Video : 1185 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 890.000 mV + Fabric + State : N/A + Status : N/A + Processes : None + +GPU 00000000:23:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-8f4d1755-9c8a-3a19-b71e-c190b3801dec + Minor Number : 2 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0x2300 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x23 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:23:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 4 + Device Current : 4 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 1000 KB/s + Rx Throughput : 4000 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 86 % + Performance State : P2 + Clocks Event Reasons + Idle : Not Active + Applications Clocks Setting : Not Active + SW Power Cap : Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 21608 MiB + Free : 2608 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 4 MiB + Free : 252 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 97 % + Memory : 35 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 72 C + GPU T.Limit Temp : 11 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 445.38 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 2640 MHz + SM : 2640 MHz + Memory : 10251 MHz + Video : 2040 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 1020.000 mV + Fabric + State : N/A + Status : N/A + Processes + GPU instance ID : N/A + Compute instance ID : N/A + Process ID : 1285135 + Type : C + Name : /usr/bin/python + Used GPU Memory : 21602 MiB + +GPU 00000000:41:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-2824147b-f0cc-0462-ce13-25cc09ad5f25 + Minor Number : 1 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0x4100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x41 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:41:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 1 + Device Current : 1 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 33 % + Performance State : P8 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 0 MiB + Free : 24217 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 1 MiB + Free : 255 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 31 C + GPU T.Limit Temp : 53 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 15.73 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 210 MHz + SM : 210 MHz + Memory : 405 MHz + Video : 1185 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 890.000 mV + Fabric + State : N/A + Status : N/A + Processes : None + +GPU 00000000:61:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-4387c558-c1f1-5370-3c65-e0135e9803f2 + Minor Number : 0 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0x6100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x61 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:61:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 1 + Device Current : 1 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 34 % + Performance State : P8 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 0 MiB + Free : 24217 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 1 MiB + Free : 255 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 30 C + GPU T.Limit Temp : 53 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 31.10 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 210 MHz + SM : 210 MHz + Memory : 405 MHz + Video : 1185 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 890.000 mV + Fabric + State : N/A + Status : N/A + Processes : None + +GPU 00000000:81:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-ad8f4544-e918-708f-cab7-69d179ebdd8a + Minor Number : 7 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0x8100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x81 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:81:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 4 + Device Current : 4 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 1000 KB/s + Rx Throughput : 4000 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 98 % + Performance State : P2 + Clocks Event Reasons + Idle : Not Active + Applications Clocks Setting : Not Active + SW Power Cap : Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 23638 MiB + Free : 578 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 4 MiB + Free : 252 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 100 % + Memory : 37 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 76 C + GPU T.Limit Temp : 8 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 446.36 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 2520 MHz + SM : 2520 MHz + Memory : 10251 MHz + Video : 1995 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 940.000 mV + Fabric + State : N/A + Status : N/A + Processes + GPU instance ID : N/A + Compute instance ID : N/A + Process ID : 1292330 + Type : C + Name : /usr/bin/python + Used GPU Memory : 23632 MiB + +GPU 00000000:A1:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-ff7fd1e7-6ad3-a7f7-30fc-d7d0bda6b911 + Minor Number : 6 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0xa100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0xA1 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:A1:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 1 + Device Current : 1 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 34 % + Performance State : P8 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 0 MiB + Free : 24217 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 1 MiB + Free : 255 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 31 C + GPU T.Limit Temp : 53 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 19.76 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 210 MHz + SM : 210 MHz + Memory : 405 MHz + Video : 1185 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 885.000 mV + Fabric + State : N/A + Status : N/A + Processes : None + +GPU 00000000:C1:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-5dbe62cf-84b6-cfcb-1bc6-dc3c28b4fe25 + Minor Number : 5 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0xc100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0xC1 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:C1:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 1 + Device Current : 1 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 33 % + Performance State : P8 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 0 MiB + Free : 24217 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 1 MiB + Free : 255 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 29 C + GPU T.Limit Temp : 54 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 30.08 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 210 MHz + SM : 210 MHz + Memory : 405 MHz + Video : 1185 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 890.000 mV + Fabric + State : N/A + Status : N/A + Processes : None + +GPU 00000000:E1:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-839379d5-33cc-2fae-2c64-a782cd63010e + Minor Number : 4 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0xe100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0xE1 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:E1:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 1 + Device Current : 1 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 32 % + Performance State : P8 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 0 MiB + Free : 24217 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 1 MiB + Free : 255 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 29 C + GPU T.Limit Temp : 54 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 16.78 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 210 MHz + SM : 210 MHz + Memory : 405 MHz + Video : 1185 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 890.000 mV + Fabric + State : N/A + Status : N/A + Processes : None + diff --git a/components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.154.05.out.2.valid b/components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.154.05.out.2.valid new file mode 100644 index 00000000..f0b2c4ac --- /dev/null +++ b/components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.154.05.out.2.valid @@ -0,0 +1,1584 @@ + + + + +==============NVSMI LOG============== + +Timestamp : Mon Jul 8 05:36:06 2024 +Driver Version : 535.154.05 +CUDA Version : 12.2 + +Attached GPUs : 8 +GPU 00000000:01:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-370465f5-30e0-6877-b85c-16729f41b90f + Minor Number : 3 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0x100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x01 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:01:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 1 + Device Current : 1 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 0 % + Performance State : P8 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 0 MiB + Free : 24217 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 1 MiB + Free : 255 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 50 C + GPU T.Limit Temp : 34 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 22.43 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 210 MHz + SM : 210 MHz + Memory : 405 MHz + Video : 1185 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 870.000 mV + Fabric + State : N/A + Status : N/A + Processes : None + +GPU 00000000:23:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-8f4d1755-9c8a-3a19-b71e-c190b3801dec + Minor Number : 2 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0x2300 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x23 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:23:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 4 + Device Current : 4 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 1000 KB/s + Rx Throughput : 5000 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 91 % + Performance State : P2 + Clocks Event Reasons + Idle : Not Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 22090 MiB + Free : 2126 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 4 MiB + Free : 252 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 100 % + Memory : 35 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 74 C + GPU T.Limit Temp : 11 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 405.59 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 1763 MHz + SM : 2640 MHz + Memory : 10251 MHz + Video : 2040 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 1020.000 mV + Fabric + State : N/A + Status : N/A + Processes + GPU instance ID : N/A + Compute instance ID : N/A + Process ID : 1285135 + Type : C + Name : /usr/bin/python + Used GPU Memory : 22084 MiB + +GPU 00000000:41:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-2824147b-f0cc-0462-ce13-25cc09ad5f25 + Minor Number : 1 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0x4100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x41 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:41:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 1 + Device Current : 1 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 33 % + Performance State : P8 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 0 MiB + Free : 24217 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 1 MiB + Free : 255 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 34 C + GPU T.Limit Temp : 50 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 17.04 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 210 MHz + SM : 210 MHz + Memory : 405 MHz + Video : 1185 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 885.000 mV + Fabric + State : N/A + Status : N/A + Processes : None + +GPU 00000000:61:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-4387c558-c1f1-5370-3c65-e0135e9803f2 + Minor Number : 0 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0x6100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x61 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:61:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 1 + Device Current : 1 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 34 % + Performance State : P8 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 0 MiB + Free : 24217 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 1 MiB + Free : 255 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 30 C + GPU T.Limit Temp : 53 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 30.72 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 210 MHz + SM : 210 MHz + Memory : 405 MHz + Video : 1185 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 890.000 mV + Fabric + State : N/A + Status : N/A + Processes : None + +GPU 00000000:81:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-ad8f4544-e918-708f-cab7-69d179ebdd8a + Minor Number : 7 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0x8100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x81 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:81:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 4 + Device Current : 4 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 1000 KB/s + Rx Throughput : 7000 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 91 % + Performance State : P2 + Clocks Event Reasons + Idle : Not Active + Applications Clocks Setting : Not Active + SW Power Cap : Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 22844 MiB + Free : 1372 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 4 MiB + Free : 252 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 96 % + Memory : 36 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 74 C + GPU T.Limit Temp : 9 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 435.24 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 2730 MHz + SM : 2730 MHz + Memory : 10251 MHz + Video : 2115 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 1040.000 mV + Fabric + State : N/A + Status : N/A + Processes + GPU instance ID : N/A + Compute instance ID : N/A + Process ID : 1292330 + Type : C + Name : /usr/bin/python + Used GPU Memory : 22838 MiB + +GPU 00000000:A1:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-ff7fd1e7-6ad3-a7f7-30fc-d7d0bda6b911 + Minor Number : 6 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0xa100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0xA1 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:A1:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 1 + Device Current : 1 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 34 % + Performance State : P8 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 0 MiB + Free : 24217 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 1 MiB + Free : 255 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 30 C + GPU T.Limit Temp : 54 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 19.40 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 210 MHz + SM : 210 MHz + Memory : 405 MHz + Video : 1185 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 885.000 mV + Fabric + State : N/A + Status : N/A + Processes : None + +GPU 00000000:C1:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-5dbe62cf-84b6-cfcb-1bc6-dc3c28b4fe25 + Minor Number : 5 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0xc100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0xC1 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:C1:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 1 + Device Current : 1 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 33 % + Performance State : P8 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 0 MiB + Free : 24217 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 1 MiB + Free : 255 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 29 C + GPU T.Limit Temp : 54 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 30.47 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 210 MHz + SM : 210 MHz + Memory : 405 MHz + Video : 1185 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 890.000 mV + Fabric + State : N/A + Status : N/A + Processes : None + +GPU 00000000:E1:00.0 + Product Name : NVIDIA GeForce RTX 4090 + Product Brand : GeForce + Product Architecture : Ada Lovelace + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-839379d5-33cc-2fae-2c64-a782cd63010e + Minor Number : 4 + VBIOS Version : 95.02.18.C0.09 + MultiGPU Board : No + Board ID : 0xe100 + Board Part Number : N/A + GPU Part Number : 2684-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G002.0000.00.03 + OEM Object : 2.0 + ECC Object : 6.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0xE1 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x268410DE + Bus Id : 00000000:E1:00.0 + Sub System Id : 0x167C10DE + GPU Link Info + PCIe Generation + Max : 4 + Current : 1 + Device Current : 1 + Device Max : 4 + Host Max : 4 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 32 % + Performance State : P8 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 24564 MiB + Reserved : 346 MiB + Used : 0 MiB + Free : 24217 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 1 MiB + Free : 255 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Disabled + Pending : Disabled + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 192 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 29 C + GPU T.Limit Temp : 54 C + GPU Shutdown T.Limit Temp : -7 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating T.Limit Temp : N/A + GPU Power Readings + Power Draw : 17.02 W + Current Power Limit : 450.00 W + Requested Power Limit : 450.00 W + Default Power Limit : 450.00 W + Min Power Limit : 150.00 W + Max Power Limit : 450.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 210 MHz + SM : 210 MHz + Memory : 405 MHz + Video : 1185 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 3105 MHz + SM : 3105 MHz + Memory : 10501 MHz + Video : 2415 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 890.000 mV + Fabric + State : N/A + Status : N/A + Processes : None + + diff --git a/components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.183.01.out.0.invalid b/components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.183.01.out.0.invalid new file mode 100644 index 00000000..87be3921 --- /dev/null +++ b/components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.183.01.out.0.invalid @@ -0,0 +1,91 @@ +==============NVSMI LOG============== + +Timestamp : Fri Jun 21 02:03:08 2024 +Driver Version : 535.183.01 +CUDA Version : 12.2 + +Attached GPUs : 8 +GPU 00000000:53:00.0 + Product Name : NVIDIA H100 80GB HBM3 + Product Brand : NVIDIA + Product Architecture : Hopper + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : Disabled + Pending : Disabled + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : 1655022005478 + GPU UUID : GPU-566e6f65-c38e-b12f-4824-6944218668b9 + Minor Number : 0 + VBIOS Version : 96.00.74.00.01 + MultiGPU Board : No + Board ID : 0x5300 + Board Part Number : 692-2G520-0200-000 + GPU Part Number : 2330-885-A1 + FRU Part Number : N/A + Module ID : 7 + Inforom Version + Image Version : G520.0200.00.05 + OEM Object : 2.1 + ECC Object : 7.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : 2024/06/20 21:24:53.799 + Latest Duration : 88355 us + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : 535.183.01 + GPU Virtualization Mode + Virtualization Mode : Pass-Through + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : No + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x53 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x233010DE + Bus Id : 00000000:53:00.0 + Sub System Id : 0x16C110DE + GPU Link Info + PCIe Generation + Max : 5 + Current : 5 + Device Current : 5 + Device Max : 5 + Host Max : N/A + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 120 + Replay Number Rollovers : 0 + Tx Throughput : 632 KB/s + Rx Throughput : 531 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : N/A + Performance State : P0 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + Invalid HW Slowdown : Not Active + Invalid HW Thermal Slowdown : Not Active + Invalid HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active diff --git a/components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.183.01.out.0.valid b/components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.183.01.out.0.valid new file mode 100644 index 00000000..0ecfed00 --- /dev/null +++ b/components/accelerator/nvidia/query/testdata/nvidia-smi-query.535.183.01.out.0.valid @@ -0,0 +1,1652 @@ +==============NVSMI LOG============== + +Timestamp : Fri Jun 21 02:03:08 2024 +Driver Version : 535.183.01 +CUDA Version : 12.2 + +Attached GPUs : 8 +GPU 00000000:53:00.0 + Product Name : NVIDIA H100 80GB HBM3 + Product Brand : NVIDIA + Product Architecture : Hopper + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : Disabled + Pending : Disabled + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : 1655022005478 + GPU UUID : GPU-566e6f65-c38e-b12f-4824-6944218668b9 + Minor Number : 0 + VBIOS Version : 96.00.74.00.01 + MultiGPU Board : No + Board ID : 0x5300 + Board Part Number : 692-2G520-0200-000 + GPU Part Number : 2330-885-A1 + FRU Part Number : N/A + Module ID : 7 + Inforom Version + Image Version : G520.0200.00.05 + OEM Object : 2.1 + ECC Object : 7.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : 2024/06/20 21:24:53.799 + Latest Duration : 88355 us + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : 535.183.01 + GPU Virtualization Mode + Virtualization Mode : Pass-Through + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : No + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x53 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x233010DE + Bus Id : 00000000:53:00.0 + Sub System Id : 0x16C110DE + GPU Link Info + PCIe Generation + Max : 5 + Current : 5 + Device Current : 5 + Device Max : 5 + Host Max : N/A + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 120 + Replay Number Rollovers : 0 + Tx Throughput : 632 KB/s + Rx Throughput : 531 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : N/A + Performance State : P0 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + Sparse Operation Mode : Disabled + FB Memory Usage + Total : 81559 MiB + Reserved : 551 MiB + Used : 0 MiB + Free : 81007 MiB + BAR1 Memory Usage + Total : 131072 MiB + Used : 1 MiB + Free : 131071 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Enabled + Pending : Enabled + ECC Errors + Volatile + SRAM Correctable : 0 + SRAM Uncorrectable Parity : 0 + SRAM Uncorrectable SEC-DED : 0 + DRAM Correctable : 0 + DRAM Uncorrectable : 0 + Aggregate + SRAM Correctable : 0 + SRAM Uncorrectable Parity : 0 + SRAM Uncorrectable SEC-DED : 0 + DRAM Correctable : 0 + DRAM Uncorrectable : 0 + SRAM Threshold Exceeded : No + Aggregate Uncorrectable SRAM Sources + SRAM L2 : 0 + SRAM SM : 0 + SRAM Microcontroller : 0 + SRAM PCIE : 0 + SRAM Other : 0 + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 2560 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 36 C + GPU T.Limit Temp : 50 C + GPU Shutdown T.Limit Temp : -8 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : N/A + Memory Current Temp : 44 C + Memory Max Operating T.Limit Temp : 0 C + GPU Power Readings + Power Draw : 71.97 W + Current Power Limit : 700.00 W + Requested Power Limit : 700.00 W + Default Power Limit : 700.00 W + Min Power Limit : 200.00 W + Max Power Limit : 700.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 345 MHz + SM : 345 MHz + Memory : 2619 MHz + Video : 765 MHz + Applications Clocks + Graphics : 1980 MHz + Memory : 2619 MHz + Default Applications Clocks + Graphics : 1980 MHz + Memory : 2619 MHz + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 1980 MHz + SM : 1980 MHz + Memory : 2619 MHz + Video : 1545 MHz + Max Customer Boost Clocks + Graphics : 1980 MHz + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 665.000 mV + Fabric + State : Completed + Status : Success + Processes : None + +GPU 00000000:64:00.0 + Product Name : NVIDIA H100 80GB HBM3 + Product Brand : NVIDIA + Product Architecture : Hopper + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : Disabled + Pending : Disabled + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : 1654522012780 + GPU UUID : GPU-9930d717-21dd-7692-054e-3d0a2bae76b5 + Minor Number : 1 + VBIOS Version : 96.00.74.00.01 + MultiGPU Board : No + Board ID : 0x6400 + Board Part Number : 692-2G520-0200-000 + GPU Part Number : 2330-885-A1 + FRU Part Number : N/A + Module ID : 5 + Inforom Version + Image Version : G520.0200.00.05 + OEM Object : 2.1 + ECC Object : 7.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : 2024/06/20 21:18:47.468 + Latest Duration : 94925 us + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : 535.183.01 + GPU Virtualization Mode + Virtualization Mode : Pass-Through + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : No + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x64 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x233010DE + Bus Id : 00000000:64:00.0 + Sub System Id : 0x16C110DE + GPU Link Info + PCIe Generation + Max : 5 + Current : 5 + Device Current : 5 + Device Max : 5 + Host Max : N/A + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 111 + Replay Number Rollovers : 0 + Tx Throughput : 625 KB/s + Rx Throughput : 500 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : N/A + Performance State : P0 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + Sparse Operation Mode : Disabled + FB Memory Usage + Total : 81559 MiB + Reserved : 551 MiB + Used : 0 MiB + Free : 81007 MiB + BAR1 Memory Usage + Total : 131072 MiB + Used : 1 MiB + Free : 131071 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Enabled + Pending : Enabled + ECC Errors + Volatile + SRAM Correctable : 0 + SRAM Uncorrectable Parity : 0 + SRAM Uncorrectable SEC-DED : 0 + DRAM Correctable : 0 + DRAM Uncorrectable : 0 + Aggregate + SRAM Correctable : 0 + SRAM Uncorrectable Parity : 0 + SRAM Uncorrectable SEC-DED : 0 + DRAM Correctable : 0 + DRAM Uncorrectable : 0 + SRAM Threshold Exceeded : No + Aggregate Uncorrectable SRAM Sources + SRAM L2 : 0 + SRAM SM : 0 + SRAM Microcontroller : 0 + SRAM PCIE : 0 + SRAM Other : 0 + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 2560 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 37 C + GPU T.Limit Temp : 49 C + GPU Shutdown T.Limit Temp : -8 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : N/A + Memory Current Temp : 44 C + Memory Max Operating T.Limit Temp : 0 C + GPU Power Readings + Power Draw : 68.88 W + Current Power Limit : 700.00 W + Requested Power Limit : 700.00 W + Default Power Limit : 700.00 W + Min Power Limit : 200.00 W + Max Power Limit : 700.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 345 MHz + SM : 345 MHz + Memory : 2619 MHz + Video : 765 MHz + Applications Clocks + Graphics : 1980 MHz + Memory : 2619 MHz + Default Applications Clocks + Graphics : 1980 MHz + Memory : 2619 MHz + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 1980 MHz + SM : 1980 MHz + Memory : 2619 MHz + Video : 1545 MHz + Max Customer Boost Clocks + Graphics : 1980 MHz + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 685.000 mV + Fabric + State : Completed + Status : Success + Processes : None + +GPU 00000000:75:00.0 + Product Name : NVIDIA H100 80GB HBM3 + Product Brand : NVIDIA + Product Architecture : Hopper + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : Disabled + Pending : Disabled + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : 1654922018857 + GPU UUID : GPU-973565ff-45cf-12ea-4e91-33386720f7bb + Minor Number : 2 + VBIOS Version : 96.00.74.00.01 + MultiGPU Board : No + Board ID : 0x7500 + Board Part Number : 692-2G520-0200-000 + GPU Part Number : 2330-885-A1 + FRU Part Number : N/A + Module ID : 6 + Inforom Version + Image Version : G520.0200.00.05 + OEM Object : 2.1 + ECC Object : 7.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : 535.183.01 + GPU Virtualization Mode + Virtualization Mode : Pass-Through + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : No + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x75 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x233010DE + Bus Id : 00000000:75:00.0 + Sub System Id : 0x16C110DE + GPU Link Info + PCIe Generation + Max : 5 + Current : 5 + Device Current : 5 + Device Max : 5 + Host Max : N/A + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 58 + Replay Number Rollovers : 0 + Tx Throughput : 4113 KB/s + Rx Throughput : 414 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : N/A + Performance State : P0 + Clocks Event Reasons + Idle : Not Active + Applications Clocks Setting : Not Active + SW Power Cap : Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Active + Sync Boost : Not Active + SW Thermal Slowdown : Active + Display Clock Setting : Not Active + Sparse Operation Mode : Disabled + FB Memory Usage + Total : 81559 MiB + Reserved : 551 MiB + Used : 9221 MiB + Free : 71786 MiB + BAR1 Memory Usage + Total : 131072 MiB + Used : 4 MiB + Free : 131068 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Enabled + Pending : Enabled + ECC Errors + Volatile + SRAM Correctable : 0 + SRAM Uncorrectable Parity : 0 + SRAM Uncorrectable SEC-DED : 0 + DRAM Correctable : 0 + DRAM Uncorrectable : 0 + Aggregate + SRAM Correctable : 0 + SRAM Uncorrectable Parity : 0 + SRAM Uncorrectable SEC-DED : 0 + DRAM Correctable : 0 + DRAM Uncorrectable : 0 + SRAM Threshold Exceeded : No + Aggregate Uncorrectable SRAM Sources + SRAM L2 : 0 + SRAM SM : 0 + SRAM Microcontroller : 0 + SRAM PCIE : 0 + SRAM Other : 0 + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 2560 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 39 C + GPU T.Limit Temp : 47 C + GPU Shutdown T.Limit Temp : -8 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : N/A + Memory Current Temp : 46 C + Memory Max Operating T.Limit Temp : 0 C + GPU Power Readings + Power Draw : 118.01 W + Current Power Limit : 700.00 W + Requested Power Limit : 700.00 W + Default Power Limit : 700.00 W + Min Power Limit : 200.00 W + Max Power Limit : 700.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 1980 MHz + SM : 1980 MHz + Memory : 2619 MHz + Video : 1545 MHz + Applications Clocks + Graphics : 1980 MHz + Memory : 2619 MHz + Default Applications Clocks + Graphics : 1980 MHz + Memory : 2619 MHz + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 1980 MHz + SM : 1980 MHz + Memory : 2619 MHz + Video : 1545 MHz + Max Customer Boost Clocks + Graphics : 1980 MHz + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 940.000 mV + Fabric + State : Completed + Status : Success + Processes + GPU instance ID : N/A + Compute instance ID : N/A + Process ID : 1002127 + Type : C + Name : /usr/bin/python + Used GPU Memory : 9212 MiB + +GPU 00000000:86:00.0 + Product Name : NVIDIA H100 80GB HBM3 + Product Brand : NVIDIA + Product Architecture : Hopper + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : Disabled + Pending : Disabled + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : 1654522012479 + GPU UUID : GPU-6b932a61-abfd-5ea1-b26a-7dea7f2fa601 + Minor Number : 3 + VBIOS Version : 96.00.74.00.01 + MultiGPU Board : No + Board ID : 0x8600 + Board Part Number : 692-2G520-0200-000 + GPU Part Number : 2330-885-A1 + FRU Part Number : N/A + Module ID : 8 + Inforom Version + Image Version : G520.0200.00.05 + OEM Object : 2.1 + ECC Object : 7.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : 2024/06/20 21:24:54.176 + Latest Duration : 93746 us + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : 535.183.01 + GPU Virtualization Mode + Virtualization Mode : Pass-Through + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : No + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x86 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x233010DE + Bus Id : 00000000:86:00.0 + Sub System Id : 0x16C110DE + GPU Link Info + PCIe Generation + Max : 5 + Current : 5 + Device Current : 5 + Device Max : 5 + Host Max : N/A + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 60 + Replay Number Rollovers : 0 + Tx Throughput : 632 KB/s + Rx Throughput : 496 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : N/A + Performance State : P0 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + Sparse Operation Mode : Disabled + FB Memory Usage + Total : 81559 MiB + Reserved : 551 MiB + Used : 0 MiB + Free : 81007 MiB + BAR1 Memory Usage + Total : 131072 MiB + Used : 1 MiB + Free : 131071 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Enabled + Pending : Enabled + ECC Errors + Volatile + SRAM Correctable : 0 + SRAM Uncorrectable Parity : 0 + SRAM Uncorrectable SEC-DED : 0 + DRAM Correctable : 0 + DRAM Uncorrectable : 0 + Aggregate + SRAM Correctable : 0 + SRAM Uncorrectable Parity : 0 + SRAM Uncorrectable SEC-DED : 0 + DRAM Correctable : 0 + DRAM Uncorrectable : 0 + SRAM Threshold Exceeded : No + Aggregate Uncorrectable SRAM Sources + SRAM L2 : 0 + SRAM SM : 0 + SRAM Microcontroller : 0 + SRAM PCIE : 0 + SRAM Other : 0 + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 2560 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 39 C + GPU T.Limit Temp : 47 C + GPU Shutdown T.Limit Temp : -8 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : N/A + Memory Current Temp : 46 C + Memory Max Operating T.Limit Temp : 0 C + GPU Power Readings + Power Draw : 72.91 W + Current Power Limit : 700.00 W + Requested Power Limit : 700.00 W + Default Power Limit : 700.00 W + Min Power Limit : 200.00 W + Max Power Limit : 700.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 345 MHz + SM : 345 MHz + Memory : 2619 MHz + Video : 765 MHz + Applications Clocks + Graphics : 1980 MHz + Memory : 2619 MHz + Default Applications Clocks + Graphics : 1980 MHz + Memory : 2619 MHz + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 1980 MHz + SM : 1980 MHz + Memory : 2619 MHz + Video : 1545 MHz + Max Customer Boost Clocks + Graphics : 1980 MHz + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 680.000 mV + Fabric + State : Completed + Status : Success + Processes : None + +GPU 00000000:97:00.0 + Product Name : NVIDIA H100 80GB HBM3 + Product Brand : NVIDIA + Product Architecture : Hopper + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : Disabled + Pending : Disabled + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : 1654922016523 + GPU UUID : GPU-726d356b-f548-bbfd-c81e-b9c68f2f1587 + Minor Number : 4 + VBIOS Version : 96.00.74.00.01 + MultiGPU Board : No + Board ID : 0x9700 + Board Part Number : 692-2G520-0200-000 + GPU Part Number : 2330-885-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G520.0200.00.05 + OEM Object : 2.1 + ECC Object : 7.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : 2024/06/20 21:24:53.495 + Latest Duration : 123052 us + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : 535.183.01 + GPU Virtualization Mode + Virtualization Mode : Pass-Through + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : No + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x97 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x233010DE + Bus Id : 00000000:97:00.0 + Sub System Id : 0x16C110DE + GPU Link Info + PCIe Generation + Max : 5 + Current : 5 + Device Current : 5 + Device Max : 5 + Host Max : N/A + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 63 + Replay Number Rollovers : 0 + Tx Throughput : 613 KB/s + Rx Throughput : 468 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : N/A + Performance State : P0 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + Sparse Operation Mode : Disabled + FB Memory Usage + Total : 81559 MiB + Reserved : 551 MiB + Used : 0 MiB + Free : 81007 MiB + BAR1 Memory Usage + Total : 131072 MiB + Used : 1 MiB + Free : 131071 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Enabled + Pending : Enabled + ECC Errors + Volatile + SRAM Correctable : 0 + SRAM Uncorrectable Parity : 0 + SRAM Uncorrectable SEC-DED : 0 + DRAM Correctable : 0 + DRAM Uncorrectable : 0 + Aggregate + SRAM Correctable : 0 + SRAM Uncorrectable Parity : 0 + SRAM Uncorrectable SEC-DED : 0 + DRAM Correctable : 0 + DRAM Uncorrectable : 0 + SRAM Threshold Exceeded : No + Aggregate Uncorrectable SRAM Sources + SRAM L2 : 0 + SRAM SM : 0 + SRAM Microcontroller : 0 + SRAM PCIE : 0 + SRAM Other : 0 + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 2560 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 38 C + GPU T.Limit Temp : 48 C + GPU Shutdown T.Limit Temp : -8 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : N/A + Memory Current Temp : 46 C + Memory Max Operating T.Limit Temp : 0 C + GPU Power Readings + Power Draw : 71.68 W + Current Power Limit : 700.00 W + Requested Power Limit : 700.00 W + Default Power Limit : 700.00 W + Min Power Limit : 200.00 W + Max Power Limit : 700.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 345 MHz + SM : 345 MHz + Memory : 2619 MHz + Video : 765 MHz + Applications Clocks + Graphics : 1980 MHz + Memory : 2619 MHz + Default Applications Clocks + Graphics : 1980 MHz + Memory : 2619 MHz + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 1980 MHz + SM : 1980 MHz + Memory : 2619 MHz + Video : 1545 MHz + Max Customer Boost Clocks + Graphics : 1980 MHz + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 680.000 mV + Fabric + State : Completed + Status : Success + Processes : None + +GPU 00000000:A8:00.0 + Product Name : NVIDIA H100 80GB HBM3 + Product Brand : NVIDIA + Product Architecture : Hopper + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : Disabled + Pending : Disabled + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : 1655022006092 + GPU UUID : GPU-06db64f6-6181-5a90-cf95-5ce52a58c4e5 + Minor Number : 5 + VBIOS Version : 96.00.74.00.01 + MultiGPU Board : No + Board ID : 0xa800 + Board Part Number : 692-2G520-0200-000 + GPU Part Number : 2330-885-A1 + FRU Part Number : N/A + Module ID : 3 + Inforom Version + Image Version : G520.0200.00.05 + OEM Object : 2.1 + ECC Object : 7.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : 2024/06/21 00:38:44.110 + Latest Duration : 124923 us + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : 535.183.01 + GPU Virtualization Mode + Virtualization Mode : Pass-Through + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : No + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0xA8 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x233010DE + Bus Id : 00000000:A8:00.0 + Sub System Id : 0x16C110DE + GPU Link Info + PCIe Generation + Max : 5 + Current : 5 + Device Current : 5 + Device Max : 5 + Host Max : N/A + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 93 + Replay Number Rollovers : 0 + Tx Throughput : 613 KB/s + Rx Throughput : 527 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : N/A + Performance State : P0 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + Sparse Operation Mode : Disabled + FB Memory Usage + Total : 81559 MiB + Reserved : 551 MiB + Used : 0 MiB + Free : 81007 MiB + BAR1 Memory Usage + Total : 131072 MiB + Used : 1 MiB + Free : 131071 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Enabled + Pending : Enabled + ECC Errors + Volatile + SRAM Correctable : 0 + SRAM Uncorrectable Parity : 0 + SRAM Uncorrectable SEC-DED : 0 + DRAM Correctable : 0 + DRAM Uncorrectable : 0 + Aggregate + SRAM Correctable : 0 + SRAM Uncorrectable Parity : 0 + SRAM Uncorrectable SEC-DED : 0 + DRAM Correctable : 0 + DRAM Uncorrectable : 0 + SRAM Threshold Exceeded : No + Aggregate Uncorrectable SRAM Sources + SRAM L2 : 0 + SRAM SM : 0 + SRAM Microcontroller : 0 + SRAM PCIE : 0 + SRAM Other : 0 + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 2560 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 36 C + GPU T.Limit Temp : 50 C + GPU Shutdown T.Limit Temp : -8 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : N/A + Memory Current Temp : 43 C + Memory Max Operating T.Limit Temp : 0 C + GPU Power Readings + Power Draw : 71.69 W + Current Power Limit : 700.00 W + Requested Power Limit : 700.00 W + Default Power Limit : 700.00 W + Min Power Limit : 200.00 W + Max Power Limit : 700.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 345 MHz + SM : 345 MHz + Memory : 2619 MHz + Video : 765 MHz + Applications Clocks + Graphics : 1980 MHz + Memory : 2619 MHz + Default Applications Clocks + Graphics : 1980 MHz + Memory : 2619 MHz + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 1980 MHz + SM : 1980 MHz + Memory : 2619 MHz + Video : 1545 MHz + Max Customer Boost Clocks + Graphics : 1980 MHz + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 680.000 mV + Fabric + State : Completed + Status : Success + Processes : None + +GPU 00000000:B9:00.0 + Product Name : NVIDIA H100 80GB HBM3 + Product Brand : NVIDIA + Product Architecture : Hopper + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : Disabled + Pending : Disabled + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : 1654522012843 + GPU UUID : GPU-a5271aa6-4cdb-28c7-0ef5-5da271d0caae + Minor Number : 6 + VBIOS Version : 96.00.74.00.01 + MultiGPU Board : No + Board ID : 0xb900 + Board Part Number : 692-2G520-0200-000 + GPU Part Number : 2330-885-A1 + FRU Part Number : N/A + Module ID : 4 + Inforom Version + Image Version : G520.0200.00.05 + OEM Object : 2.1 + ECC Object : 7.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : 2024/06/20 21:24:53.400 + Latest Duration : 89412 us + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : 535.183.01 + GPU Virtualization Mode + Virtualization Mode : Pass-Through + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : No + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0xB9 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x233010DE + Bus Id : 00000000:B9:00.0 + Sub System Id : 0x16C110DE + GPU Link Info + PCIe Generation + Max : 5 + Current : 5 + Device Current : 5 + Device Max : 5 + Host Max : N/A + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 69 + Replay Number Rollovers : 0 + Tx Throughput : 621 KB/s + Rx Throughput : 480 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : N/A + Performance State : P0 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + Sparse Operation Mode : Disabled + FB Memory Usage + Total : 81559 MiB + Reserved : 551 MiB + Used : 0 MiB + Free : 81007 MiB + BAR1 Memory Usage + Total : 131072 MiB + Used : 1 MiB + Free : 131071 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Enabled + Pending : Enabled + ECC Errors + Volatile + SRAM Correctable : 0 + SRAM Uncorrectable Parity : 0 + SRAM Uncorrectable SEC-DED : 0 + DRAM Correctable : 0 + DRAM Uncorrectable : 0 + Aggregate + SRAM Correctable : 0 + SRAM Uncorrectable Parity : 0 + SRAM Uncorrectable SEC-DED : 0 + DRAM Correctable : 0 + DRAM Uncorrectable : 0 + SRAM Threshold Exceeded : No + Aggregate Uncorrectable SRAM Sources + SRAM L2 : 0 + SRAM SM : 0 + SRAM Microcontroller : 0 + SRAM PCIE : 0 + SRAM Other : 0 + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 2560 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 37 C + GPU T.Limit Temp : 50 C + GPU Shutdown T.Limit Temp : -8 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : N/A + Memory Current Temp : 44 C + Memory Max Operating T.Limit Temp : 0 C + GPU Power Readings + Power Draw : 69.44 W + Current Power Limit : 700.00 W + Requested Power Limit : 700.00 W + Default Power Limit : 700.00 W + Min Power Limit : 200.00 W + Max Power Limit : 700.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 345 MHz + SM : 345 MHz + Memory : 2619 MHz + Video : 765 MHz + Applications Clocks + Graphics : 1980 MHz + Memory : 2619 MHz + Default Applications Clocks + Graphics : 1980 MHz + Memory : 2619 MHz + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 1980 MHz + SM : 1980 MHz + Memory : 2619 MHz + Video : 1545 MHz + Max Customer Boost Clocks + Graphics : 1980 MHz + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 680.000 mV + Fabric + State : Completed + Status : Success + Processes : None + +GPU 00000000:CA:00.0 + Product Name : NVIDIA H100 80GB HBM3 + Product Brand : NVIDIA + Product Architecture : Hopper + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : Disabled + Pending : Disabled + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : 1654922015414 + GPU UUID : GPU-7e78e6c8-c71c-66c7-9ded-f8836cecab32 + Minor Number : 7 + VBIOS Version : 96.00.74.00.01 + MultiGPU Board : No + Board ID : 0xca00 + Board Part Number : 692-2G520-0200-000 + GPU Part Number : 2330-885-A1 + FRU Part Number : N/A + Module ID : 2 + Inforom Version + Image Version : G520.0200.00.05 + OEM Object : 2.1 + ECC Object : 7.16 + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : 2024/06/20 21:24:54.938 + Latest Duration : 121894 us + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : 535.183.01 + GPU Virtualization Mode + Virtualization Mode : Pass-Through + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : No + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0xCA + Device : 0x00 + Domain : 0x0000 + Device Id : 0x233010DE + Bus Id : 00000000:CA:00.0 + Sub System Id : 0x16C110DE + GPU Link Info + PCIe Generation + Max : 5 + Current : 5 + Device Current : 5 + Device Max : 5 + Host Max : N/A + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 78 + Replay Number Rollovers : 0 + Tx Throughput : 613 KB/s + Rx Throughput : 531 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : N/A + Performance State : P0 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + Sparse Operation Mode : Disabled + FB Memory Usage + Total : 81559 MiB + Reserved : 551 MiB + Used : 0 MiB + Free : 81007 MiB + BAR1 Memory Usage + Total : 131072 MiB + Used : 1 MiB + Free : 131071 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : Enabled + Pending : Enabled + ECC Errors + Volatile + SRAM Correctable : 0 + SRAM Uncorrectable Parity : 0 + SRAM Uncorrectable SEC-DED : 0 + DRAM Correctable : 0 + DRAM Uncorrectable : 0 + Aggregate + SRAM Correctable : 0 + SRAM Uncorrectable Parity : 0 + SRAM Uncorrectable SEC-DED : 0 + DRAM Correctable : 0 + DRAM Uncorrectable : 0 + SRAM Threshold Exceeded : No + Aggregate Uncorrectable SRAM Sources + SRAM L2 : 0 + SRAM SM : 0 + SRAM Microcontroller : 0 + SRAM PCIE : 0 + SRAM Other : 0 + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows + Correctable Error : 0 + Uncorrectable Error : 0 + Pending : No + Remapping Failure Occurred : No + Bank Remap Availability Histogram + Max : 2560 bank(s) + High : 0 bank(s) + Partial : 0 bank(s) + Low : 0 bank(s) + None : 0 bank(s) + Temperature + GPU Current Temp : 35 C + GPU T.Limit Temp : 51 C + GPU Shutdown T.Limit Temp : -8 C + GPU Slowdown T.Limit Temp : -2 C + GPU Max Operating T.Limit Temp : 0 C + GPU Target Temperature : N/A + Memory Current Temp : 43 C + Memory Max Operating T.Limit Temp : 0 C + GPU Power Readings + Power Draw : 74.76 W + Current Power Limit : 700.00 W + Requested Power Limit : 700.00 W + Default Power Limit : 700.00 W + Min Power Limit : 200.00 W + Max Power Limit : 700.00 W + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 345 MHz + SM : 345 MHz + Memory : 2619 MHz + Video : 765 MHz + Applications Clocks + Graphics : 1980 MHz + Memory : 2619 MHz + Default Applications Clocks + Graphics : 1980 MHz + Memory : 2619 MHz + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 1980 MHz + SM : 1980 MHz + Memory : 2619 MHz + Video : 1545 MHz + Max Customer Boost Clocks + Graphics : 1980 MHz + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : 720.000 mV + Fabric + State : Completed + Status : Success + Processes : None diff --git a/components/components.go b/components/components.go index 59ae5455..6d91eef9 100644 --- a/components/components.go +++ b/components/components.go @@ -8,13 +8,26 @@ import ( "github.com/leptonai/leptond/errdefs" ) +// Component represents an individual component of the system. +// +// Each component check is independent of each other. +// But the underlying implementation may share the same data sources +// in order to minimize the querying overhead (e.g., nvidia-smi calls). +// +// Each component implements its own output format inside the State struct. +// And recommended to have a consistent name for its HTTP handler. +// And recommended to define const keys for the State extra information field. type Component interface { + // Defines the component name, + // and used for the HTTP handler registration path. + // Must be globally unique. Name() string State(ctx context.Context) ([]State, error) SetState(ctx context.Context, states ...State) error Events(ctx context.Context, since time.Time) ([]Event, error) + // Called upon server close. Close() error } diff --git a/go.mod b/go.mod index 2b1c9103..64edf0c1 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.22.4 require ( github.com/coreos/go-systemd/v22 v22.5.0 + github.com/dustin/go-humanize v1.0.1 github.com/gin-contrib/requestid v1.0.2 github.com/gin-contrib/zap v1.1.3 github.com/gin-gonic/gin v1.10.0 @@ -11,6 +12,7 @@ require ( github.com/urfave/cli v1.22.15 go.uber.org/zap v1.27.0 golang.org/x/sys v0.20.0 + k8s.io/apimachinery v0.30.2 sigs.k8s.io/yaml v1.4.0 ) @@ -22,11 +24,14 @@ require ( github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect github.com/gabriel-vasile/mimetype v1.4.3 // indirect github.com/gin-contrib/sse v0.1.0 // indirect + github.com/go-logr/logr v1.4.1 // indirect github.com/go-ole/go-ole v1.2.6 // indirect github.com/go-playground/locales v0.14.1 // indirect github.com/go-playground/universal-translator v0.18.1 // indirect github.com/go-playground/validator/v10 v10.20.0 // indirect github.com/goccy/go-json v0.10.2 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/google/gofuzz v1.2.0 // indirect github.com/google/uuid v1.6.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/cpuid/v2 v2.2.7 // indirect @@ -50,5 +55,11 @@ require ( golang.org/x/net v0.25.0 // indirect golang.org/x/text v0.15.0 // indirect google.golang.org/protobuf v1.34.1 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/klog/v2 v2.120.1 // indirect + k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect + sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect ) diff --git a/go.sum b/go.sum index 4d61b920..34825887 100644 --- a/go.sum +++ b/go.sum @@ -14,6 +14,8 @@ github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46t github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= +github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0= github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk= github.com/gin-contrib/requestid v1.0.2 h1:MRJqVwmpHAbkkF3ENgtDWU41l5ICmmVy01q2ZDYI1BE= @@ -24,6 +26,8 @@ github.com/gin-contrib/zap v1.1.3 h1:9e/U9fYd4/OBfmSEBs5hHZq114uACn7bpuzvCkcJySA github.com/gin-contrib/zap v1.1.3/go.mod h1:+BD/6NYZKJyUpqVoJEvgeq9GLz8pINEQvak9LHNOTSE= github.com/gin-gonic/gin v1.10.0 h1:nTuyha1TYqgedzytsKYqna+DfLos46nTv2ygFy86HFU= github.com/gin-gonic/gin v1.10.0/go.mod h1:4PMNQiOhvDRa013RKVbsiNwoyezlm2rm0uX/T7kzp5Y= +github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= +github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s= @@ -37,15 +41,21 @@ github.com/go-playground/validator/v10 v10.20.0/go.mod h1:dbuPbCMFw/DrkbEynArYaC github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= +github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM= github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= @@ -75,6 +85,8 @@ github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFt github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ= github.com/shoenig/test v0.6.4 h1:kVTaSd7WLz5WZ2IaoM0RSzRsUD+m8wRR+5qvntpn4LU= github.com/shoenig/test v0.6.4/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= @@ -97,6 +109,8 @@ github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65E github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg= github.com/urfave/cli v1.22.15 h1:nuqt+pdC/KqswQKhETJjo7pvn/k4xMUxgW6liI7XpnM= github.com/urfave/cli v1.22.15/go.mod h1:wSan1hmo5zeyLGBjRJbzRTNk8gwoYa2B9n4q9dmRIc0= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= @@ -108,11 +122,26 @@ go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8= golang.org/x/arch v0.8.0 h1:3wRIsP3pM4yUptoR96otTUOXI367OS0+c9eeRi9doIc= golang.org/x/arch v0.8.0/go.mod h1:FEVrYAQjsQXMVJ1nsMoVVXPZg6p2JE2mx8psSWTDQys= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.23.0 h1:dIJU/v2J8Mdglj/8rJ6UUOM3Zc9zLZxVZwwxMooUSAI= golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -120,18 +149,41 @@ golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/protobuf v1.34.1 h1:9ddQBjfCyZPOHPUiPxpYESBLc+T8P3E+Vo4IbKZgFWg= google.golang.org/protobuf v1.34.1/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +k8s.io/apimachinery v0.30.2 h1:fEMcnBj6qkzzPGSVsAZtQThU62SmQ4ZymlXRC5yFSCg= +k8s.io/apimachinery v0.30.2/go.mod h1:iexa2somDaxdnj7bha06bhb43Zpa6eWH8N8dbqVjTUc= +k8s.io/klog/v2 v2.120.1 h1:QXU6cPEOIslTGvZaXvFWiP9VKyeet3sawzTOvdXb4Vw= +k8s.io/klog/v2 v2.120.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= +k8s.io/utils v0.0.0-20230726121419-3b25d923346b h1:sgn3ZU783SCgtaSJjpcVVlRqd6GSnlTLKgpAAttJvpI= +k8s.io/utils v0.0.0-20230726121419-3b25d923346b/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= nullprogram.com/x/optparse v1.0.0/go.mod h1:KdyPE+Igbe0jQUrVfMqDMeJQIJZEuyV7pjYmp6pbG50= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= +sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= +sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= +sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= +sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= diff --git a/internal/server/config/config.go b/internal/server/config/config.go index 4244953f..5d22b296 100644 --- a/internal/server/config/config.go +++ b/internal/server/config/config.go @@ -14,6 +14,9 @@ type Config struct { // Metrics settings Address string `json:"address"` + + // Component specific configurations. + Components map[string]any `json:"components,omitempty"` } func (config *Config) YAML() ([]byte, error) { diff --git a/internal/server/config/config_test.go b/internal/server/config/config_test.go new file mode 100644 index 00000000..3033dbf6 --- /dev/null +++ b/internal/server/config/config_test.go @@ -0,0 +1,17 @@ +package config + +import ( + "testing" +) + +func TestLoadConfigYAML(t *testing.T) { + config, err := LoadConfigYAML("testdata/test.0.yaml") + if err != nil { + t.Fatalf("failed to parse config: %v", err) + } + b, err := config.YAML() + if err != nil { + t.Fatalf("failed to marshal config: %v", err) + } + t.Logf("config:\n%s", string(b)) +} diff --git a/internal/server/config/testdata/test.0.yaml b/internal/server/config/testdata/test.0.yaml new file mode 100644 index 00000000..5ba078da --- /dev/null +++ b/internal/server/config/testdata/test.0.yaml @@ -0,0 +1,9 @@ +annotations: + a: b + c: d + +address: 127.0.0.1:123 + +components: + accelerator-nvidia-info: + interval: 10s diff --git a/internal/server/server.go b/internal/server/server.go index 8a4763db..e36d85ff 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -1,11 +1,15 @@ package server import ( + "context" + "fmt" "io" "net/http" "strings" "github.com/leptonai/leptond/components" + nvidiainfo "github.com/leptonai/leptond/components/accelerator/nvidia/info" + nvidiaquery "github.com/leptonai/leptond/components/accelerator/nvidia/query" "github.com/leptonai/leptond/components/info" "github.com/leptonai/leptond/components/os" lepConfig "github.com/leptonai/leptond/internal/server/config" @@ -19,7 +23,7 @@ type Server struct { config *lepConfig.Config } -func New(config *lepConfig.Config) (*Server, error) { +func New(ctx context.Context, config *lepConfig.Config) (*Server, error) { s := &Server{} router := gin.Default() @@ -30,20 +34,35 @@ func New(config *lepConfig.Config) (*Server, error) { ctx.JSON(http.StatusOK, gin.H{"status": "ok", "version": "v1"}) }) - all := []components.Component{ + // start with default components + allComponents := []components.Component{ os.New(), info.New(config.Annotations), } - components.NewComponentSet() + // now iterate all components in the config + for k := range config.Components { + switch k { + case nvidiainfo.Name: + cfg, err := nvidiaquery.ParseQuerierConfig(config.Components[k]) + if err != nil { + return nil, fmt.Errorf("failed to parse component %s config: %w", k, err) + } + allComponents = append(allComponents, nvidiainfo.New(ctx, cfg.Interval)) + default: + return nil, fmt.Errorf("unknown component %s", k) + } + } - for _, c := range all { + components.NewComponentSet() + for _, c := range allComponents { // this guarantees no name conflict, thus safe to register handlers by its name if err := components.RegComponent(c.Name(), c); err != nil { log.Logger.Warnf("failed to register component %s: %v", c.Name(), err) continue } } + // TODO: implement configuration file refresh + apply v1 := router.Group("/v1") NewGlobalHandler(components.GetAllComponents()).AddToRoute(v1)