Skip to content

Commit

Permalink
Merge pull request #2 from leptonai/nvidia-components
Browse files Browse the repository at this point in the history
feat(components/nvidia/query): initial commit
  • Loading branch information
gyuho authored Jul 10, 2024
2 parents 1e02bb1 + 463e54c commit 27d6014
Show file tree
Hide file tree
Showing 22 changed files with 7,968 additions and 8 deletions.
36 changes: 36 additions & 0 deletions .golangci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# https://golangci-lint.run/usage/configuration/
run:
concurrency: 4
timeout: 15m

# include test files or not, default is true
tests: true

linters-settings:
gofmt:
# simplify code: gofmt with `-s` option, true by default
simplify: true
goimports:
# put imports beginning with prefix after 3rd-party packages;
# it's a comma-separated list of prefixes
local-prefixes: lepton.ai/lepton
misspell:
# Correct spellings using locale preferences for US or UK.
# Default is to use a neutral variety of English.
# Setting locale to US will correct the British spelling of 'colour' to 'color'.
locale: US

linters:
fast: false
disable-all: true
enable:
- errcheck
- gosimple
- govet
- ineffassign
- staticcheck
- unused
- gofmt
- goimports
- misspell
- unconvert
9 changes: 5 additions & 4 deletions cmd/leptond/command/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ func App() *cli.App {
},
}
app.Action = func(cliContext *cli.Context) error {
ctx := context.Background()
rootCtx, rootCancel := context.WithCancel(context.Background())
defer rootCancel()
start := time.Now()
config := defaultConfig()
signals := make(chan os.Signal, 2048)
Expand All @@ -54,18 +55,18 @@ func App() *cli.App {

log.Logger.Infof("starting leptond %v: %v", version.Version, version.Version)

done := handleSignals(ctx, signals, serverC)
done := handleSignals(rootCtx, signals, serverC)
// start the signal handler as soon as we can to make sure that
// we don't miss any signals during boot
signal.Notify(signals, handledSignals...)

server, err := lepServer.New(config)
server, err := lepServer.New(rootCtx, config)
if err != nil {
return err
}
serverC <- server

if err := notifyReady(ctx); err != nil {
if err := notifyReady(rootCtx); err != nil {
log.Logger.Warn("notify ready failed")
}
log.Logger.Infof("successfully booted in %fs", time.Since(start).Seconds())
Expand Down
149 changes: 149 additions & 0 deletions components/accelerator/nvidia/info/info.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
// Package info implements static information display.
package info

import (
"context"
"encoding/json"
"net/http"
"strconv"
"time"

"github.com/leptonai/leptond/components"
nvidiaquery "github.com/leptonai/leptond/components/accelerator/nvidia/query"
)

const (
// Name is the component name returned by (*component).Name.
// The component serves relatively static information about the NVIDIA accelerator.
Name = "accelerator-nvidia-info"

// StateKeyDriver names the state whose "version" extra info is the driver version.
StateKeyDriver = "driver"
// StateKeyCUDA names the state whose "version" extra info is the CUDA version.
StateKeyCUDA = "cuda"
// StateKeyGPU names the state whose "attached" extra info is the attached GPU count.
StateKeyGPU = "gpu"
// StateKeyProduct names the state describing the first GPU reported by the querier.
StateKeyProduct = "product"
// StateKeyProductName, StateKeyProductBrand and StateKeyProductArchitecture
// are the extra-info keys used inside the StateKeyProduct state.
StateKeyProductName = "name"
StateKeyProductBrand = "brand"
StateKeyProductArchitecture = "architecture"
)

// New builds the NVIDIA info component.
//
// It derives a cancelable context from ctx, starts the package-level default
// querier with that context and queryInterval, and returns a component that
// keeps ctx (for later restarts in Apply) together with the cancel function.
func New(ctx context.Context, queryInterval time.Duration) components.Component {
	queryCtx, stopQuery := context.WithCancel(ctx)
	comp := &component{
		rootCtx: ctx,
		cancel:  stopQuery,
		querier: nvidiaquery.DefaultQuerier,
	}
	nvidiaquery.DefaultQuerier.Start(queryCtx, queryInterval)
	return comp
}

// Compile-time check that *component satisfies components.Component.
var _ components.Component = (*component)(nil)

// component implements components.Component on top of an nvidiaquery.Querier.
type component struct {
// rootCtx is the context passed to New; Apply derives each replacement querier context from it.
rootCtx context.Context
// cancel cancels the context most recently handed to querier.Start (set in New, replaced in Apply).
cancel context.CancelFunc
// querier supplies the results read by State and ServeHTTP; New sets it to nvidiaquery.DefaultQuerier.
querier nvidiaquery.Querier
}

// Name returns the package-level component name constant.
func (c *component) Name() string {
	return Name
}

// Apply re-configures the querier, e.g. when the polling interval changes.
//
// cfg is parsed with nvidiaquery.ParseQuerierConfig; a parse error is returned
// unchanged and the running querier is left untouched. Otherwise the current
// querier context is canceled, the querier is stopped, and it is started again
// with a fresh context derived from the root context and the parsed interval.
//
// TODO: integrate with the component interface
func (c *component) Apply(cfg any) error {
	parsed, parseErr := nvidiaquery.ParseQuerierConfig(cfg)
	if parseErr != nil {
		return parseErr
	}

	// Tear down the current run before applying the new config. Per the
	// original author's note, stopping is fine because the querier still
	// serves the last request.
	c.cancel()
	c.querier.Stop()

	// Restart under a new child of the root context and remember its cancel.
	nextCtx, nextCancel := context.WithCancel(c.rootCtx)
	c.cancel = nextCancel
	c.querier.Start(nextCtx, parsed.Interval)

	return nil
}

// State reports the component states built from the querier's last result.
//
// If the last query carried an error, or produced no output, a single
// unhealthy state describing that condition is returned. Otherwise it returns
// healthy states for the driver version, the CUDA version and the attached GPU
// count, plus a product state taken from the first GPU when at least one GPU
// is listed. The returned error is always nil.
func (c *component) State(ctx context.Context) ([]components.State, error) {
	result := c.querier.Last()

	switch {
	case result.Error != nil:
		return []components.State{{
			Healthy: false,
			Error:   result.Error,
			Reason:  "last query failed",
		}}, nil
	case result.Output == nil:
		return []components.State{{
			Healthy: false,
			Reason:  "no output",
		}}, nil
	}

	out := result.Output
	states := []components.State{
		{
			Name:      StateKeyDriver,
			Healthy:   true,
			ExtraInfo: map[string]string{"version": out.DriverVersion},
		},
		{
			Name:      StateKeyCUDA,
			Healthy:   true,
			ExtraInfo: map[string]string{"version": out.CUDAVersion},
		},
		{
			Name:      StateKeyGPU,
			Healthy:   true,
			ExtraInfo: map[string]string{"attached": strconv.Itoa(out.AttachedGPUs)},
		},
	}

	// Without any listed GPU there is no product information to report.
	if len(out.GPUs) == 0 {
		return states, nil
	}

	// Product details are taken from the first GPU only.
	product := components.State{
		Name:    StateKeyProduct,
		Healthy: true,
		ExtraInfo: map[string]string{
			StateKeyProductName:         out.GPUs[0].ProductName,
			StateKeyProductBrand:        out.GPUs[0].ProductBrand,
			StateKeyProductArchitecture: out.GPUs[0].ProductArchitecture,
		},
	}
	return append(states, product), nil
}

// SetState is a no-op for this component: the given states are ignored and
// nil is always returned.
func (c *component) SetState(ctx context.Context, states ...components.State) error {
	return nil
}

// Events always returns a nil slice and a nil error; this component does not
// report any events regardless of since.
func (c *component) Events(ctx context.Context, since time.Time) ([]components.Event, error) {
	var none []components.Event
	return none, nil
}

// ServeHTTP writes everything returned by the querier's All method as JSON.
// When encoding fails it answers with a 500 and a plain error message.
func (c *component) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
	resp.Header().Set("Content-Type", "application/json")

	encoder := json.NewEncoder(resp)
	err := encoder.Encode(c.querier.All())
	if err == nil {
		return
	}
	http.Error(resp, "failed to encode response", http.StatusInternalServerError)
}

// HTTPHandler returns the component itself, which serves requests through
// its ServeHTTP method.
func (c *component) HTTPHandler() http.Handler {
	var handler http.Handler = c
	return handler
}

// Close releases the component's resources: it cancels the context that was
// handed to the querier (so the derived context does not stay registered with
// the root context until the root itself ends) and then stops the querier.
// This mirrors the teardown sequence used by Apply. It always returns nil.
func (c *component) Close() error {
	// Guard against a component that was not built through New.
	if c.cancel != nil {
		// Calling a context.CancelFunc more than once is a no-op.
		c.cancel()
	}

	// safe to call stop multiple times
	c.querier.Stop()
	return nil
}
13 changes: 13 additions & 0 deletions components/accelerator/nvidia/nvidia.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package nvidia

import "os/exec"

// SMIExists reports whether an "nvidia-smi" executable can be found on the
// local machine's PATH. It only looks the binary up; it does not run it.
func SMIExists() bool {
	path, err := exec.LookPath("nvidia-smi")
	return err == nil && path != ""
}
Loading

0 comments on commit 27d6014

Please sign in to comment.