Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gpu: support reading usage of multiple cards on linux #79

Merged
merged 1 commit into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ require (
github.com/refraction-networking/utls v1.6.3
github.com/shirou/gopsutil/v4 v4.24.9
github.com/spf13/viper v1.19.0
github.com/tidwall/gjson v1.18.0
github.com/urfave/cli/v2 v2.27.5
golang.org/x/net v0.29.0
golang.org/x/sys v0.25.0
Expand Down Expand Up @@ -74,6 +75,8 @@ require (
github.com/spf13/pflag v1.0.5 // indirect
github.com/subosito/gotenv v1.6.0 // indirect
github.com/tcnksm/go-gitconfig v0.1.2 // indirect
github.com/tidwall/match v1.1.1 // indirect
github.com/tidwall/pretty v1.2.0 // indirect
github.com/tklauser/go-sysconf v0.3.12 // indirect
github.com/tklauser/numcpus v0.6.1 // indirect
github.com/ulikunitz/xz v0.5.11 // indirect
Expand Down
6 changes: 6 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,12 @@ github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8
github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU=
github.com/tcnksm/go-gitconfig v0.1.2 h1:iiDhRitByXAEyjgBqsKi9QU4o2TNtv9kPP3RgPgXBPw=
github.com/tcnksm/go-gitconfig v0.1.2/go.mod h1:/8EhP4H7oJZdIPyT+/UIsG87kTzrzM4UsLGSItWYCpE=
github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY=
github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs=
github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU=
github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI=
github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk=
Expand Down
2 changes: 1 addition & 1 deletion model/host.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ type HostState struct {
UdpConnCount uint64
ProcessCount uint64
Temperatures []SensorTemperature
GPU float64
GPU []float64
}

func (s *HostState) PB() *pb.State {
Expand Down
27 changes: 0 additions & 27 deletions pkg/gpu/gpu.go

This file was deleted.

5 changes: 3 additions & 2 deletions pkg/gpu/gpu_darwin.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,9 @@ func GetGPUModel() ([]string, error) {
return util.RemoveDuplicate(models), nil
}

func FindUtilization(key, dictKey string) (int, error) {
return findUtilization(key, dictKey)
// GetGPUStat reports GPU utilization on darwin as a single-element slice.
//
// The value comes from findUtilization, queried with the
// "PerformanceStatistics" key and the "Device Utilization %" dictionary key,
// and is converted from int to float64. When the lookup fails the result is
// nil (not a slice holding a zero reading) so that callers never mistake a
// failed query for a real 0% sample; this matches the other platform
// implementations of GetGPUStat, which return nil on error.
func GetGPUStat() ([]float64, error) {
	usage, err := findUtilization("PerformanceStatistics", "Device Utilization %")
	if err != nil {
		return nil, err
	}
	return []float64{float64(usage)}, nil
}

func findDevices(key string) ([]string, error) {
Expand Down
11 changes: 11 additions & 0 deletions pkg/gpu/gpu_fallback.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
//go:build !darwin && !linux && !windows

package gpu

// GetGPUModel is the stub used on platforms excluded by this file's build
// constraint. It reports no GPU models and no error.
func GetGPUModel() ([]string, error) {
	var models []string // intentionally left nil: nothing to report
	return models, nil
}

// GetGPUStat is the stub used on platforms excluded by this file's build
// constraint. It reports no utilization samples and no error.
func GetGPUStat() ([]float64, error) {
	var usage []float64 // intentionally left nil: nothing to report
	return usage, nil
}
125 changes: 125 additions & 0 deletions pkg/gpu/gpu_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
//go:build linux

package gpu

import (
"errors"

"github.com/nezhahq/agent/pkg/gpu/vendor"
)

// GPU vendor identifiers compared against vendorType.
// Numbering starts at 1 (iota + 1), so the zero value of vendorType matches
// neither vendor and falls through to the "invalid vendor" branch of the
// switches in GetGPUModel and GetGPUStat.
const (
vendorAMD = iota + 1
vendorNVIDIA
)

// vendorType records which vendor backend to use; it is assigned in init and
// read by GetGPUModel and GetGPUStat.
var vendorType uint8

// init selects the GPU vendor backend when the package is loaded.
// It probes NVIDIA through getNvidiaStat: a successful probe selects
// vendorNVIDIA, and any probe error selects vendorAMD instead.
func init() {
	if _, probeErr := getNvidiaStat(); probeErr == nil {
		vendorType = vendorNVIDIA
		return
	}
	vendorType = vendorAMD
}

// getNvidiaStat collects usage values through vendor.NvidiaSMI configured
// with the binary at /usr/bin/nvidia-smi.
// It returns a nil slice together with the error if either Start or
// GatherUsage fails.
func getNvidiaStat() ([]float64, error) {
	nv := &vendor.NvidiaSMI{BinPath: "/usr/bin/nvidia-smi"}

	if startErr := nv.Start(); startErr != nil {
		return nil, startErr
	}

	usage, gatherErr := nv.GatherUsage()
	if gatherErr != nil {
		return nil, gatherErr
	}
	return usage, nil
}

// getAMDStat collects usage values through vendor.ROCmSMI configured with
// the binary at /opt/rocm/bin/rocm-smi.
// It returns a nil slice together with the error if either Start or
// GatherUsage fails.
func getAMDStat() ([]float64, error) {
	amd := &vendor.ROCmSMI{BinPath: "/opt/rocm/bin/rocm-smi"}

	if startErr := amd.Start(); startErr != nil {
		return nil, startErr
	}

	usage, gatherErr := amd.GatherUsage()
	if gatherErr != nil {
		return nil, gatherErr
	}
	return usage, nil
}

// getNvidiaHost collects GPU model names through vendor.NvidiaSMI configured
// with the binary at /usr/bin/nvidia-smi.
// It returns a nil slice together with the error if either Start or
// GatherModel fails.
func getNvidiaHost() ([]string, error) {
	nv := &vendor.NvidiaSMI{BinPath: "/usr/bin/nvidia-smi"}

	if startErr := nv.Start(); startErr != nil {
		return nil, startErr
	}

	models, gatherErr := nv.GatherModel()
	if gatherErr != nil {
		return nil, gatherErr
	}
	return models, nil
}

// getAMDHost collects GPU model names through vendor.ROCmSMI configured with
// the binary at /opt/rocm/bin/rocm-smi.
// It returns a nil slice together with the error if either Start or
// GatherModel fails.
func getAMDHost() ([]string, error) {
	amd := &vendor.ROCmSMI{BinPath: "/opt/rocm/bin/rocm-smi"}

	if startErr := amd.Start(); startErr != nil {
		return nil, startErr
	}

	models, gatherErr := amd.GatherModel()
	if gatherErr != nil {
		return nil, gatherErr
	}
	return models, nil
}

// GetGPUModel returns the GPU model names for the vendor recorded in
// vendorType, delegating to getAMDHost or getNvidiaHost.
// It returns an "invalid vendor" error when vendorType matches neither
// vendor, and a nil slice with the backend's error when the lookup fails.
func GetGPUModel() ([]string, error) {
	var lookup func() ([]string, error)

	switch vendorType {
	case vendorAMD:
		lookup = getAMDHost
	case vendorNVIDIA:
		lookup = getNvidiaHost
	default:
		return nil, errors.New("invalid vendor")
	}

	models, lookupErr := lookup()
	if lookupErr != nil {
		return nil, lookupErr
	}
	return models, nil
}

// GetGPUStat returns the usage values for the vendor recorded in vendorType,
// delegating to getAMDStat or getNvidiaStat.
// It returns an "invalid vendor" error when vendorType matches neither
// vendor, and a nil slice with the backend's error when the query fails.
func GetGPUStat() ([]float64, error) {
	var query func() ([]float64, error)

	switch vendorType {
	case vendorAMD:
		query = getAMDStat
	case vendorNVIDIA:
		query = getNvidiaStat
	default:
		return nil, errors.New("invalid vendor")
	}

	usage, queryErr := query()
	if queryErr != nil {
		return nil, queryErr
	}
	return usage, nil
}
59 changes: 37 additions & 22 deletions pkg/gpu/stat/stat_windows.go → pkg/gpu/gpu_windows.go
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
//go:build windows

// Modified from https://github.com/shirou/gopsutil/blob/master/internal/common/common_windows.go
// Original License: BSD-3-Clause

package stat
package gpu

import (
"errors"
"fmt"
"time"
"unsafe"

"github.com/jaypipes/ghw"
"golang.org/x/sys/windows"
)

Expand Down Expand Up @@ -43,6 +41,41 @@ type PDH_FMT_COUNTERVALUE_ITEM_DOUBLE struct {
FmtValue PDH_FMT_COUNTERVALUE_DOUBLE
}

// GetGPUModel lists the product name of every graphics card reported by
// ghw.GPU (called with warnings disabled).
// It returns the ghw error unchanged if the query fails, and aborts with an
// error as soon as any card has no DeviceInfo attached.
func GetGPUModel() ([]string, error) {
	info, err := ghw.GPU(ghw.WithDisableWarnings())
	if err != nil {
		return nil, err
	}

	var names []string
	for _, gc := range info.GraphicsCards {
		dev := gc.DeviceInfo
		if dev == nil {
			return nil, errors.New("Cannot find device info")
		}
		names = append(names, dev.Product.Name)
	}
	return names, nil
}

// GetGPUStat reads the "\GPU Engine(*engtype_3D)\Utilization Percentage"
// performance counter, sums the values returned by getValue, caps the sum at
// 100, and returns it as a single-element slice.
// The counter's query is closed via pdhCloseQuery when the function returns.
func GetGPUStat() ([]float64, error) {
	pc, err := newWin32PerformanceCounter("gpu_utilization", "\\GPU Engine(*engtype_3D)\\Utilization Percentage")
	if err != nil {
		return nil, err
	}
	defer pdhCloseQuery.Call(uintptr(pc.Query))

	samples, err := getValue(8192, pc)
	if err != nil {
		return nil, err
	}

	// The summed values may exceed 100, so the reported figure is capped.
	usage := []float64{sumArray(samples)}
	if usage[0] > 100 {
		usage[0] = 100
	}
	return usage, nil
}

// https://github.com/influxdata/telegraf/blob/master/plugins/inputs/win_perf_counters/performance_query.go
func getCounterArrayValue(initialBufSize uint32, counter *win32PerformanceCounter) ([]float64, error) {
for buflen := initialBufSize; buflen <= 100*1024*1024; buflen *= 2 {
Expand Down Expand Up @@ -127,24 +160,6 @@ func getValue(initialBufSize uint32, counter *win32PerformanceCounter) ([]float6
return getCounterArrayValue(initialBufSize, counter)
}

// GetGPUStat reads the "\GPU Engine(*engtype_3D)\Utilization Percentage"
// performance counter, sums the values returned by getValue, and returns the
// sum capped at 100. It returns 0 with the error if the counter cannot be
// created or read.
// The counter's query is closed via pdhCloseQuery when the function returns.
func GetGPUStat() (float64, error) {
	pc, err := newWin32PerformanceCounter("gpu_utilization", "\\GPU Engine(*engtype_3D)\\Utilization Percentage")
	if err != nil {
		return 0, err
	}
	defer pdhCloseQuery.Call(uintptr(pc.Query))

	samples, err := getValue(8192, pc)
	if err != nil {
		return 0, err
	}

	// The summed values may exceed 100, so the reported figure is capped.
	if total := sumArray(samples); total <= 100 {
		return total, nil
	}
	return 100, nil
}

func sumArray(arr []float64) float64 {
var sum float64
for _, value := range arr {
Expand Down
67 changes: 0 additions & 67 deletions pkg/gpu/stat/amd_rocm_smi.go

This file was deleted.

12 changes: 0 additions & 12 deletions pkg/gpu/stat/stat_darwin.go

This file was deleted.

7 changes: 0 additions & 7 deletions pkg/gpu/stat/stat_freebsd.go

This file was deleted.

Loading