Skip to content

Commit

Permalink
Gracefully handle panics and invalid fields
Browse files Browse the repository at this point in the history
  • Loading branch information
rohit-arora-dev committed Feb 12, 2024
1 parent 3250bfe commit 6d8975e
Show file tree
Hide file tree
Showing 8 changed files with 356 additions and 49 deletions.
226 changes: 225 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,229 @@
dcgm-exporter
!etc/
!deployment/
.env
*.pem
*.csr
vendor/

###############################################################################
# JetBrains
# https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore
###############################################################################
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# AWS User-specific
.idea/**/aws.xml

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# SonarLint plugin
.idea/sonarlint/

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser

###############################################################################
# Visual Studio Code
# https://github.com/github/gitignore/blob/master/Global/VisualStudioCode.gitignore
###############################################################################
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets

# Local History for Visual Studio Code
.history/

# Built Visual Studio Code Extensions
*.vsix

###############################################################################
# Sublime Text
# https://github.com/github/gitignore/blob/master/Global/SublimeText.gitignore
###############################################################################

# cache files for sublime text
*.tmlanguage.cache
*.tmPreferences.cache
*.stTheme.cache

# workspace files are user-specific
*.sublime-workspace

# project files should be checked into the repository, unless a significant
# proportion of contributors will probably not be using SublimeText
# *.sublime-project

# sftp configuration file
sftp-config.json

###############################################################################
# Vim
# https://github.com/github/gitignore/blob/master/Global/Vim.gitignore
###############################################################################

# Swap
[._]*.s[a-v][a-z]
# comment out if you don't need vector files
!*.svg
[._]*.sw[a-p]
[._]s[a-rt-v][a-z]
[._]ss[a-gi-z]
[._]sw[a-p]

# Session
Session.vim
Sessionx.vim

# Temporary
.netrwhist
*~
# Auto-generated tag files
tags
.env
# Persistent undo
[._]*.un~

###############################################################################
# Linux
# https://github.com/github/gitignore/blob/master/Global/Linux.gitignore
###############################################################################
*~

# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*

# KDE directory preferences
.directory

# Linux trash folder which might appear on any partition or disk
.Trash-*

# .nfs files are created when an open file is removed but is still being accessed
.nfs*

###############################################################################
# OS X
# https://github.com/github/gitignore/blob/main/Global/macOS.gitignore
###############################################################################
# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

###############################################################################
# Windows
# https://github.com/github/gitignore/blob/master/Global/Windows.gitignore
###############################################################################
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db

# Dump file
*.stackdump

# Folder config file
[Dd]esktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp

# Windows shortcuts
*.lnk
21 changes: 0 additions & 21 deletions .vscode/launch.json

This file was deleted.

4 changes: 0 additions & 4 deletions .vscode/settings.json

This file was deleted.

17 changes: 14 additions & 3 deletions pkg/cmd/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,19 @@ import (
"os"
"os/signal"
"runtime"
"runtime/debug"
"strconv"
"strings"
"sync"
"syscall"
"text/template"
"time"

"github.com/NVIDIA/dcgm-exporter/pkg/dcgmexporter"
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"

"github.com/NVIDIA/dcgm-exporter/pkg/dcgmexporter"
)

const (
Expand Down Expand Up @@ -226,17 +228,26 @@ func newOSWatcher(sigs ...os.Signal) chan os.Signal {
return sigChan
}

func action(c *cli.Context) error {
func action(c *cli.Context) (err error) {
restart:

// The purpose of this function is to capture any panic that may occur
// during initialization and return an error.
defer func() {
	// recover returns a non-nil value only when this deferred call runs
	// because of a panic; on a normal return nothing below executes.
	if r := recover(); r != nil {
		// Use Errorf (not Error) so the recovered value is substituted for
		// %v; Error would log the format string verbatim and drop r.
		logrus.WithField("Stack trace", string(debug.Stack())).Errorf("encountered a failure: %v", r)
		// err is the named result of action, so assigning it here turns
		// the panic into the error returned to the caller.
		err = fmt.Errorf("encountered a failure: %v", r)
	}
}()

logrus.Info("Starting dcgm-exporter")
config, err := contextToConfig(c)
if err != nil {
return err
}

if config.Debug {
//enable debug logging
// enable debug logging
logrus.SetLevel(logrus.DebugLevel)
logrus.Debug("Debug output is enabled")
}
Expand Down
22 changes: 12 additions & 10 deletions pkg/dcgmexporter/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,26 +25,28 @@ const (
DCGMXIDErrorsCount DCGMExporterMetric = iota + 9000
)

// DCGMFields maps DCGMExporterMetric String to enum
var DCGMFields = map[string]DCGMExporterMetric{
DCGMXIDErrorsCount.String(): DCGMXIDErrorsCount,
DCGMFIUnknown.String(): DCGMFIUnknown,
}

// String method to convert the enum value to a string
func (enm DCGMExporterMetric) String() string {
switch enm {
func (d DCGMExporterMetric) String() string {
switch d {
case DCGMXIDErrorsCount:
return "DCGM_EXP_XID_ERRORS_COUNT"
default:
return "DCGM_FI_UNKNOWN"
}
}

func mustParseDCGMExporterMetric(s string) DCGMExporterMetric {
metrics := map[string]DCGMExporterMetric{
DCGMXIDErrorsCount.String(): DCGMXIDErrorsCount,
DCGMFIUnknown.String(): DCGMFIUnknown,
}
mv, ok := metrics[s]
func IdentifyMetricType(s string) (DCGMExporterMetric, error) {
mv, ok := DCGMFields[s]
if !ok {
panic(fmt.Sprintf(`cannot parse:[%s] as DCGMExporterMetric`, s))
return mv, fmt.Errorf("unknown DCGMExporterMetric field '%s'", s)
}
return mv
return mv, nil
}

// Constants for logging fields
Expand Down
Loading

0 comments on commit 6d8975e

Please sign in to comment.