From 02d98b9357b36804a6008c3c85ef3b4eca55c819 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Mon, 25 Mar 2024 09:01:06 -0400 Subject: [PATCH] operator debug: fix pprof interval handling (#20206) The `nomad operator debug` command saves a CPU profile for each interval, and names these files based on the interval. The same functions takes a goroutine profile, heap profile, etc. but is missing the logic to interpolate the file name with the interval. This results in the operator debug command making potentially many expensive profile requests, and then overwriting the data. Update the command to save every profile it scrapes, and number them similarly to the existing CPU profile. Additionally, the command flags for `-pprof-interval` and `-pprof-duration` were validated backwards, which meant that we always coerced the `-pprof-interval` to be the same as the `-pprof-duration`, which always resulted in a single profile being taken at the start of the bundle. Correct the check as well as change the defaults to be more sensible. Fixes: https://github.com/hashicorp/nomad/issues/20151 --- .changelog/20206.txt | 3 ++ command/operator_debug.go | 28 +++++++++---------- command/operator_debug_test.go | 14 +++++----- .../content/docs/commands/operator/debug.mdx | 7 ++--- 4 files changed, 27 insertions(+), 25 deletions(-) create mode 100644 .changelog/20206.txt diff --git a/.changelog/20206.txt b/.changelog/20206.txt new file mode 100644 index 00000000000..f0c27a09e15 --- /dev/null +++ b/.changelog/20206.txt @@ -0,0 +1,3 @@ +```release-note:bug +cli: Fixed a bug where `operator debug` did not respect the `-pprof-interval` flag and would take only one profile +``` diff --git a/command/operator_debug.go b/command/operator_debug.go index c5c87ad3f16..ed3224758f7 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -200,8 +200,8 @@ Debug Options: -pprof-interval= The interval between pprof collections. Set interval equal to - duration to capture a single snapshot. Defaults to 250ms or - -pprof-duration, whichever is less. + duration to capture a single snapshot. Defaults to 30s or + -pprof-duration, whichever is more. -server-id=, Comma separated list of Nomad server names to monitor for logs, API @@ -372,7 +372,7 @@ func (c *OperatorDebugCommand) Run(args []string) int { flags.BoolVar(&allowStale, "stale", false, "") flags.StringVar(&output, "output", "", "") flags.StringVar(&pprofDuration, "pprof-duration", "1s", "") - flags.StringVar(&pprofInterval, "pprof-interval", "250ms", "") + flags.StringVar(&pprofInterval, "pprof-interval", "30s", "") flags.BoolVar(&c.verbose, "verbose", false, "") c.consul = &external{tls: &api.TLSConfig{}} @@ -439,7 +439,7 @@ func (c *OperatorDebugCommand) Run(args []string) int { c.Ui.Error(fmt.Sprintf("Error parsing pprof-interval: %s: %s", pprofInterval, err.Error())) return 1 } - if pi.Seconds() > pd.Seconds() { + if pi.Seconds() < pd.Seconds() { pi = pd } c.pprofInterval = pi @@ -1036,27 +1036,27 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, // goroutine debug type 1 = legacy text format for human readable output opts.Debug = 1 - c.savePprofProfile(path, "goroutine", opts, client) + c.savePprofProfile(path, "goroutine", opts, client, interval) // goroutine debug type 2 = goroutine stacks in panic format opts.Debug = 2 - c.savePprofProfile(path, "goroutine", opts, client) + c.savePprofProfile(path, "goroutine", opts, client, interval) // Reset to pprof binary format opts.Debug = 0 - c.savePprofProfile(path, "goroutine", opts, client) // Stack traces of all current goroutines - c.savePprofProfile(path, "trace", opts, client) // A trace of execution of the current program - c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample. - c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations - c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads + c.savePprofProfile(path, "goroutine", opts, client, interval) // Stack traces of all current goroutines + c.savePprofProfile(path, "trace", opts, client, interval) // A trace of execution of the current program + c.savePprofProfile(path, "heap", opts, client, interval) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample. + c.savePprofProfile(path, "allocs", opts, client, interval) // A sampling of all past memory allocations + c.savePprofProfile(path, "threadcreate", opts, client, interval) // Stack traces that led to the creation of new OS threads } // savePprofProfile retrieves a pprof profile and writes to disk -func (c *OperatorDebugCommand) savePprofProfile(path string, profile string, opts api.PprofOptions, client *api.Client) { - fileName := fmt.Sprintf("%s.prof", profile) +func (c *OperatorDebugCommand) savePprofProfile(path string, profile string, opts api.PprofOptions, client *api.Client, interval int) { + fileName := fmt.Sprintf("%s_%04d.prof", profile, interval) if opts.Debug > 0 { - fileName = fmt.Sprintf("%s-debug%d.txt", profile, opts.Debug) + fileName = fmt.Sprintf("%s-debug%d_%04d.txt", profile, opts.Debug, interval) } bs, err := retrievePprofProfile(profile, opts, client, c.queryOpts()) diff --git a/command/operator_debug_test.go b/command/operator_debug_test.go index 364f7c022cb..4e75e675e6c 100644 --- a/command/operator_debug_test.go +++ b/command/operator_debug_test.go @@ -452,14 +452,14 @@ func TestDebug_CapturedFiles(t *testing.T) { } pprofFiles := []string{ - "allocs.prof", - "goroutine-debug1.txt", - "goroutine-debug2.txt", - "goroutine.prof", - "heap.prof", + "allocs_0000.prof", + "goroutine-debug1_0000.txt", + "goroutine-debug2_0000.txt", + "goroutine_0000.prof", + "heap_0000.prof", "profile_0000.prof", - "threadcreate.prof", - "trace.prof", + "threadcreate_0000.prof", + "trace_0000.prof", } clientFiles := []string{ diff --git a/website/content/docs/commands/operator/debug.mdx b/website/content/docs/commands/operator/debug.mdx index 362bdfe1060..8ffde93526f 100644 --- a/website/content/docs/commands/operator/debug.mdx +++ b/website/content/docs/commands/operator/debug.mdx @@ -70,10 +70,9 @@ true. - `-pprof-duration=`: Duration for pprof collection. Defaults to 1s or `-duration`, whichever is less. -- `-pprof-interval=`: The interval between pprof - collections. Set interval equal to duration to capture a single - snapshot. Defaults to 250ms or `-pprof-duration`, whichever is - less. +- `-pprof-interval=30s`: The interval between pprof collections. Set interval + equal to pprof duration to capture a single snapshot. Defaults to 30s or + `-pprof-duration`, whichever is more. - `-server-id=,`: Comma separated list of Nomad server names to monitor for logs, API outputs, and pprof profiles. Accepts server names, "leader", or