Skip to content

Commit

Permalink
GPU Health API improvements (#70)
Browse files Browse the repository at this point in the history
Signed-off-by: Vadym Fedorov <[email protected]>
  • Loading branch information
nvvfedorov authored Jul 24, 2024
1 parent eeff1dd commit f83cdef
Show file tree
Hide file tree
Showing 9 changed files with 938 additions and 502 deletions.
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

GOLANG_VERSION := 1.14.2
GOLANGCILINT_TIMEOUT ?= 10m

.PHONY: all binary install check-format
all: binary test-main check-format
Expand Down Expand Up @@ -45,4 +46,4 @@ clean:
rm -f samples/topology/topology

lint:
golangci-lint run ./...
golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1 --fix
1,081 changes: 610 additions & 471 deletions pkg/dcgm/const.go

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions pkg/dcgm/gpu_group.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package dcgm
#include "dcgm_structs.h"
*/
import "C"

import (
"encoding/binary"
"fmt"
Expand Down Expand Up @@ -92,3 +93,34 @@ func DestroyGroup(groupId GroupHandle) (err error) {

return
}

// GroupInfo holds the details of a DCGM group as copied out of the
// dcgmGroupGetInfo response by GetGroupInfo.
type GroupInfo struct {
// Version is the version field of the DCGM response structure.
Version uint32
// GroupName is the group name reported in the response.
GroupName string
// EntityList holds one entry per entity reported for the group.
EntityList []GroupEntityPair
}

// GetGroupInfo queries DCGM for the name and member entities of the group
// identified by groupId.
//
// It returns a nil *GroupInfo together with the error produced by errorString
// when the underlying dcgmGroupGetInfo call fails. On success the entities are
// listed in the order they appear in the response; EntityList is nil when the
// response reports no entities.
func GetGroupInfo(groupId GroupHandle) (*GroupInfo, error) {
	response := C.dcgmGroupInfo_v2{
		version: C.dcgmGroupInfo_version2,
	}

	result := C.dcgmGroupGetInfo(handle.handle, groupId.handle, &response)
	if err := errorString(result); err != nil {
		return nil, err
	}

	ret := &GroupInfo{
		Version:   uint32(response.version),
		GroupName: C.GoString(&response.groupName[0]),
	}

	// Never index past the end of entityList: an unexpected count would
	// otherwise crash the caller with an index-out-of-range panic.
	count := int(response.count)
	if count > len(response.entityList) {
		count = len(response.entityList)
	}
	if count <= 0 {
		return ret, nil
	}

	// The final length is known up front, so allocate once.
	ret.EntityList = make([]GroupEntityPair, 0, count)
	for i := 0; i < count; i++ {
		entity := &response.entityList[i]
		ret.EntityList = append(ret.EntityList, GroupEntityPair{
			EntityId:      uint(entity.entityId),
			EntityGroupId: Field_Entity_Group(entity.entityGroupId),
		})
	}

	return ret, nil
}
28 changes: 28 additions & 0 deletions pkg/dcgm/gpu_group_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

func TestGroupHandle(t *testing.T) {
Expand All @@ -17,3 +18,30 @@ func TestGroupHandle(t *testing.T) {
assert.Equal(t, input, gh.GetHandle(), "values mismatch")
}
}

// TestGetGroupInfo creates a group containing a single injected GPU and checks
// that GetGroupInfo reports the group's name and that one entity.
func TestGetGroupInfo(t *testing.T) {
	teardownTest := setupTest(t)
	defer teardownTest(t)

	runOnlyWithLiveGPUs(t)
	gpus, err := withInjectionGPUs(t, 1)
	require.NoError(t, err)

	gpuID := gpus[0]

	groupID, err := CreateGroup("test1")
	require.NoError(t, err)
	defer func() {
		// Best-effort cleanup; a failure here must not mask the test result.
		_ = DestroyGroup(groupID)
	}()
	err = AddEntityToGroup(groupID, FE_GPU, gpuID)
	require.NoError(t, err)

	grInfo, err := GetGroupInfo(groupID)
	require.NoError(t, err)

	assert.Equal(t, "test1", grInfo.GroupName)
	// require rather than assert: the index expressions below would panic on
	// an empty list instead of reporting a clean test failure.
	require.Len(t, grInfo.EntityList, 1)
	assert.Equal(t, FE_GPU, grInfo.EntityList[0].EntityGroupId)
	assert.Equal(t, gpuID, grInfo.EntityList[0].EntityId)
}
121 changes: 102 additions & 19 deletions pkg/dcgm/health.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,27 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package dcgm

/*
#include "dcgm_agent.h"
#include "dcgm_structs.h"
*/
import "C"

import (
"fmt"
"math/rand"
Expand All @@ -23,14 +40,84 @@ type DeviceHealth struct {
Watches []SystemWatch
}

func setHealthWatches(groupId GroupHandle) (err error) {
result := C.dcgmHealthSet(handle.handle, groupId.handle, C.DCGM_HEALTH_WATCH_ALL)
// HealthSet enables the DCGM health check system for the given systems
func HealthSet(groupId GroupHandle, systems HealthSystem) (err error) {
result := C.dcgmHealthSet(handle.handle, groupId.handle, C.dcgmHealthSystems_t(systems))
if err = errorString(result); err != nil {
return fmt.Errorf("Error setting health watches: %s", err)
return fmt.Errorf("error setting health watches: %w", err)
}
return
}

// HealthGet retrieves the current state of the DCGM health check system for
// the given group, i.e. the value dcgmHealthGet reports, converted to a
// HealthSystem. On failure it returns HealthSystem(0) and the error produced
// by errorString.
func HealthGet(groupId GroupHandle) (HealthSystem, error) {
	var watched C.dcgmHealthSystems_t

	// &watched already has the pointer type dcgmHealthGet expects.
	rc := C.dcgmHealthGet(handle.handle, groupId.handle, &watched)
	err := errorString(rc)
	if err != nil {
		return HealthSystem(0), err
	}

	return HealthSystem(watched), nil
}

// DiagErrorDetail carries the error attached to a health incident, as filled
// in by HealthCheck.
type DiagErrorDetail struct {
// Message is the text read from the incident's error message buffer.
Message string
// Code is the incident's error code.
Code HealthCheckErrorCode
}

// Incident is one entry of the incident list that HealthCheck copies out of
// the DCGM health response.
type Incident struct {
// System is the health system the incident was reported for.
System HealthSystem
// Health is the health result recorded for this incident.
Health HealthResult
// Error holds the incident's error message and code.
Error DiagErrorDetail
// EntityInfo identifies the entity (group id and entity id) the incident refers to.
EntityInfo GroupEntityPair
}

// HealthResponse is the result returned by HealthCheck.
type HealthResponse struct {
// OverallHealth is the overall health value of the DCGM response.
OverallHealth HealthResult
// Incidents holds one entry per incident reported in the DCGM response.
Incidents []Incident
}

// HealthCheck checks the configured watches for any errors/failures/warnings
// that have occurred since the last time this check was invoked. On the first
// call, stateful information about all of the enabled watches within a group
// is created but no error results are provided. On subsequent calls, any error
// information will be returned.
//
// On failure it returns an empty HealthResponse and a *DcgmError carrying the
// DCGM return code and its message.
func HealthCheck(groupId GroupHandle) (HealthResponse, error) {
	var healthResults C.dcgmHealthResponse_v4
	healthResults.version = makeVersion4(unsafe.Sizeof(healthResults))

	result := C.dcgmHealthCheck(handle.handle, groupId.handle, (*C.dcgmHealthResponse_t)(unsafe.Pointer(&healthResults)))

	if err := errorString(result); err != nil {
		return HealthResponse{}, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result}
	}

	// Number of watches that encountered an error/warning. Clamp it to the
	// size of the incidents array so that an unexpected count cannot cause an
	// index-out-of-range panic below.
	incidents := uint(healthResults.incidentCount)
	if limit := uint(len(healthResults.incidents)); incidents > limit {
		incidents = limit
	}

	response := HealthResponse{
		OverallHealth: HealthResult(healthResults.overallHealth),
		Incidents:     make([]Incident, incidents),
	}

	for i := range response.Incidents {
		// Take the address once instead of re-indexing the C array per field.
		incident := &healthResults.incidents[i]

		response.Incidents[i] = Incident{
			System: HealthSystem(incident.system),
			Health: HealthResult(incident.health),
			Error: DiagErrorDetail{
				Message: *stringPtr(&incident.error.msg[0]),
				Code:    HealthCheckErrorCode(incident.error.code),
			},
			EntityInfo: GroupEntityPair{
				EntityGroupId: Field_Entity_Group(incident.entityInfo.entityGroupId),
				EntityId:      uint(incident.entityInfo.entityId),
			},
		}
	}

	return response, nil
}

func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) {
name := fmt.Sprintf("health%d", rand.Uint64())
groupId, err := CreateGroup(name)
Expand All @@ -43,32 +130,28 @@ func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) {
return
}

err = setHealthWatches(groupId)
err = HealthSet(groupId, DCGM_HEALTH_WATCH_ALL)
if err != nil {
return
}

var healthResults C.dcgmHealthResponse_v4
healthResults.version = makeVersion4(unsafe.Sizeof(healthResults))

result := C.dcgmHealthCheck(handle.handle, groupId.handle, (*C.dcgmHealthResponse_t)(unsafe.Pointer(&healthResults)))

if err = errorString(result); err != nil {
return deviceHealth, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result}
result, err := HealthCheck(groupId)
if err != nil {
return
}

status := healthStatus(int8(healthResults.overallHealth))
status := healthStatus(result.OverallHealth)
watches := []SystemWatch{}

// number of watches that encountred error/warning
incidents := uint(healthResults.incidentCount)
incidents := len(result.Incidents)

for j := uint(0); j < incidents; j++ {
for j := 0; j < incidents; j++ {
watch := SystemWatch{
Type: systemWatch(int(healthResults.incidents[j].system)),
Status: healthStatus(int8(healthResults.incidents[j].health)),
Type: systemWatch(result.Incidents[j].System),
Status: healthStatus(result.Incidents[j].Health),

Error: *stringPtr(&healthResults.incidents[j].error.msg[0]),
Error: result.Incidents[j].Error.Message,
}
watches = append(watches, watch)
}
Expand All @@ -82,7 +165,7 @@ func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) {
return
}

func healthStatus(status int8) string {
func healthStatus(status HealthResult) string {
switch status {
case 0:
return "Healthy"
Expand All @@ -94,7 +177,7 @@ func healthStatus(status int8) string {
return "N/A"
}

func systemWatch(watch int) string {
func systemWatch(watch HealthSystem) string {
switch watch {
case 1:
return "PCIe watches"
Expand Down
121 changes: 121 additions & 0 deletions pkg/dcgm/health_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
//go:build linux && cgo

/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package dcgm

import (
"strings"
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// TestHealthWhenInvalidGroupID checks that the health API rejects a group
// handle that does not refer to an existing group.
func TestHealthWhenInvalidGroupID(t *testing.T) {
	teardownTest := setupTest(t)
	defer teardownTest(t)
	runOnlyWithLiveGPUs(t)

	var invalidGroupID uintptr = 99
	gh := GroupHandle{}
	gh.SetHandle(invalidGroupID)

	// require.Error (not assert.Error) so the test stops before err.Error()
	// is called on a nil error, which would panic instead of failing cleanly.
	err := HealthSet(gh, DCGM_HEALTH_WATCH_PCIE)
	require.Error(t, err)
	assert.Contains(t, err.Error(), "Setting not configured")

	_, err = HealthGet(gh)
	require.Error(t, err)
	assert.Contains(t, err.Error(), "Setting not configured")

	// NOTE(review): this repeats the HealthGet check above; HealthCheck may
	// have been intended here — confirm the expected error text before changing.
	_, err = HealthGet(gh)
	require.Error(t, err)
	assert.Contains(t, err.Error(), "Setting not configured")
}

// TestHealthCheckPCIE enables the PCIe health watch on a group holding one
// injected GPU, confirms the group starts out healthy, and then verifies that
// an injected PCIe replay-counter value is reported as a warning incident.
func TestHealthCheckPCIE(t *testing.T) {
	cleanup := setupTest(t)
	defer cleanup(t)

	runOnlyWithLiveGPUs(t)
	injected, err := withInjectionGPUs(t, 1)
	require.NoError(t, err)

	gpuID := injected[0]

	group, err := CreateGroup("test1")
	require.NoError(t, err)
	defer func() {
		_ = DestroyGroup(group)
	}()

	require.NoError(t, AddEntityToGroup(group, FE_GPU, gpuID))
	require.NoError(t, HealthSet(group, DCGM_HEALTH_WATCH_PCIE))

	watched, err := HealthGet(group)
	require.NoError(t, err)
	require.Equal(t, DCGM_HEALTH_WATCH_PCIE, watched)

	skipTestIfUnhealthy(t, group)

	// injectReplayCounter writes a PCIe replay-counter sample for the GPU,
	// timestamped at now+offset.
	injectReplayCounter := func(offset time.Duration, value int64) {
		injectErr := InjectFieldValue(gpuID,
			DCGM_FI_DEV_PCIE_REPLAY_COUNTER,
			DCGM_FT_INT64,
			0,
			time.Now().Add(offset).UnixMicro(),
			value,
		)
		require.NoError(t, injectErr)
	}

	// A zero counter dated in the past: the group must still pass.
	injectReplayCounter(-50*time.Second, 0)

	check, err := HealthCheck(group)
	require.NoError(t, err)
	require.Equal(t, DCGM_HEALTH_RESULT_PASS, check.OverallHealth)

	// Inject an error into PCI.
	injectReplayCounter(100*time.Second, 10)

	check, err = HealthCheck(group)
	require.NoError(t, err)
	require.Equal(t, DCGM_HEALTH_RESULT_WARN, check.OverallHealth)
	require.Len(t, check.Incidents, 1)

	incident := check.Incidents[0]
	assert.Equal(t, gpuID, incident.EntityInfo.EntityId)
	assert.Equal(t, DCGM_HEALTH_WATCH_PCIE, incident.System)
	assert.Equal(t, DCGM_FR_PCI_REPLAY_RATE, incident.Error.Code)
}

// skipTestIfUnhealthy runs a health check on groupId and skips the calling
// test when the overall result is anything other than DCGM_HEALTH_RESULT_PASS,
// listing the incident messages in the skip reason. A failing health check
// call fails the test.
func skipTestIfUnhealthy(t *testing.T, groupId GroupHandle) {
	// Attribute the skip/failure to the calling test's line, not this helper.
	t.Helper()

	health, err := HealthCheck(groupId)
	require.NoError(t, err)
	if health.OverallHealth == DCGM_HEALTH_RESULT_PASS {
		return
	}

	messages := make([]string, 0, len(health.Incidents))
	for _, incident := range health.Incidents {
		messages = append(messages, incident.Error.Message)
	}

	t.Skip("Skipping health check test because we are already unhealthy: " + strings.Join(messages, ", "))
}
Loading

0 comments on commit f83cdef

Please sign in to comment.