From 0a1d30dda5090f5482091097777016c4898a624d Mon Sep 17 00:00:00 2001 From: Alan Donovan Date: Tue, 30 Jan 2024 15:56:28 -0500 Subject: [PATCH] crashmonitor: a monitor for unexpected crashes This change defines a crash monitor that increments a telemetry counter for the stack any time the Go runtime crashes. It relies on the runtime/debug.SetCrashOutput feature of go1.23 (#42888); if built with an earlier version of Go, the monitor is a no-op. This feature causes the Go runtime to write the crash to an arbitrary file instead of stderr. In this case, the file is a pipe to a subprocess that executes the application executable in monitor mode. When the pipe is closed, the monitor parses the program counters out of the backtrace, then uses a the usual logic to convert it to the name of a counter, which it then increments. The CL includes two tests, an internal test that parses a crash from the child process's stderr, and an integration test that uses crashmonitor.Start and SetCrashOutput, if available. Updates golang/go#42888 Change-Id: Ie40f9403fa99a9e0bab3b4edc9430be5e18150d7 Reviewed-on: https://go-review.googlesource.com/c/telemetry/+/559503 Auto-Submit: Alan Donovan LUCI-TryBot-Result: Go LUCI Reviewed-by: Robert Findley --- crashmonitor/crash_go123.go | 14 ++ crashmonitor/monitor.go | 294 +++++++++++++++++++++++++++++++ crashmonitor/monitor_test.go | 161 +++++++++++++++++ internal/counter/parse.go | 35 +--- internal/counter/stackcounter.go | 62 +++++-- 5 files changed, 522 insertions(+), 44 deletions(-) create mode 100644 crashmonitor/crash_go123.go create mode 100644 crashmonitor/monitor.go create mode 100644 crashmonitor/monitor_test.go diff --git a/crashmonitor/crash_go123.go b/crashmonitor/crash_go123.go new file mode 100644 index 0000000..2e0c1b3 --- /dev/null +++ b/crashmonitor/crash_go123.go @@ -0,0 +1,14 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build go1.23 +// +build go1.23 + +package crashmonitor + +import "runtime/debug" + +func init() { + setCrashOutput = debug.SetCrashOutput +} diff --git a/crashmonitor/monitor.go b/crashmonitor/monitor.go new file mode 100644 index 0000000..39689a2 --- /dev/null +++ b/crashmonitor/monitor.go @@ -0,0 +1,294 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package crashmonitor + +// This file defines a monitor that reports arbitrary Go runtime +// crashes to telemetry. + +import ( + "bytes" + "fmt" + "io" + "log" + "os" + "os/exec" + "reflect" + "runtime/debug" + "strconv" + "strings" + + "golang.org/x/telemetry/internal/counter" +) + +// Supported reports whether the runtime supports [runtime.SetCrashOutput]. +// +// TODO(adonovan): eliminate once go1.23+ is assured. +func Supported() bool { return setCrashOutput != nil } + +var setCrashOutput func(*os.File) error // = runtime.SetCrashOutput on go1.23+ + +// Start starts the monitor process, which performs automated +// reporting of unexpected crashes via Go telemetry. Call this +// function once immediately after [counter.Open]() within the main +// function of your application, before argument parsing. +// +// This function re-executes the current executable as a child +// process, in a special mode. In that mode, the call to Start will +// never return. +// +// The application should avoid doing expensive work in init functions +// as they will be executed twice. Run with GODEBUG=inittrace=1 to +// display the running time of each package initializer. +// +// Start uses the [debug.SetCrashOutput] mechanism, which is a +// process-wide resource. Do not make other calls to that function +// within your application. Start is a no-op unless the program is +// built with go1.23+. +func Start() { + if !Supported() { + return + } + + const crashmonitorVar = "X_TELEMETRY_CRASHMONITOR" + if os.Getenv(crashmonitorVar) != "" { + // This process is the crashmonitor (child). + log.SetFlags(0) + log.SetPrefix("crashmonitor: ") + + // Wait for parent process's dying gasp. + // If the parent dies for any reason this read will return. + data, err := io.ReadAll(os.Stdin) + if err != nil { + log.Fatalf("failed to read from input pipe: %v", err) + } + + // If the only line is the sentinel, it wasn't a crash. + if bytes.Count(data, []byte("\n")) < 2 { + os.Exit(0) // parent exited without crash report + } + + log.Printf("parent reported crash:\n%s", data) + + // Parse the stack out of the crash report + // and record a telemetry count for it. + name, err := telemetryCounterName(data) + if err != nil { + // Keep count of how often this happens + // so that we can investigate if necessary. + incrementCounter("crash/malformed") + + // Something went wrong. + // Save the crash securely in the file system. + f, err := os.CreateTemp(os.TempDir(), "*.crash") + if err != nil { + log.Fatal(err) + } + if _, err := f.Write(data); err != nil { + log.Fatal(err) + } + if err := f.Close(); err != nil { + log.Fatal(err) + } + log.Printf("failed to report crash to telemetry: %v", err) + log.Fatalf("crash report saved at %s", f.Name()) + } + + incrementCounter(name) + + log.Fatalf("telemetry crash recorded") + } + + // This process is the application (parent). + // Fork+exec the crashmonitor (child). + exe, err := os.Executable() + if err != nil { + log.Fatal(err) + } + cmd := exec.Command(exe, "** crashmonitor **") // this unused arg is just for ps(1) + cmd.Env = append(os.Environ(), crashmonitorVar+"=1") + cmd.Stderr = os.Stderr + cmd.Stdout = os.Stderr + pipe, err := cmd.StdinPipe() + if err != nil { + log.Fatalf("StdinPipe: %v", err) + } + + writeSentinel(pipe) + // Ensure that we get pc=0x%x values in the traceback. + debug.SetTraceback("system") + setCrashOutput(pipe.(*os.File)) // (this conversion is safe) + + if err := cmd.Start(); err != nil { + log.Fatalf("can't start crash monitor: %v", err) + } + + // Now return and run the application proper... +} + +// (stubbed by test) +var incrementCounter = func(name string) { counter.New(name).Inc() } + +// The sentinel function returns its address. The difference between +// this value as observed by calls in two different processes of the +// same executable tells us the relative offset of their text segments. +// +// It would be nice if SetCrashOutput took care of this as it's fiddly +// and likely to confuse every user at first. +func sentinel() uint64 { + return uint64(reflect.ValueOf(sentinel).Pointer()) +} + +func writeSentinel(out io.Writer) { + fmt.Fprintf(out, "sentinel %x\n", sentinel()) +} + +// telemetryCounterName parses a crash report produced by the Go +// runtime, extracts the stack of the first runnable goroutine, +// converts each line into telemetry form ("symbol:relative-line"), +// and returns this as the name of a counter. +func telemetryCounterName(crash []byte) (string, error) { + pcs, err := parseStackPCs(string(crash)) + if err != nil { + return "", err + } + + // Limit the number of frames we request. + pcs = pcs[:min(len(pcs), 16)] + + if len(pcs) == 0 { + // This can occur if all goroutines are idle, as when + // caught in a deadlock, or killed by an async signal + // while blocked. + // + // TODO(adonovan): consider how to report such + // situations. Reporting a goroutine in [sleep] or + // [select] state could be quite confusing without + // further information about the nature of the crash, + // as the problem is not local to the code location. + // + // For now, we keep count of this situation so that we + // can access whether it needs a more involved solution. + return "crash/no-running-goroutine", nil + } + + // This string appears at the start of all + // crashmonitor-generated counter names. + // + // It is tempting to expose this as a parameter of Start, but + // it is not without risk. What value should most programs + // provide? There's no point giving the name of the executable + // as this is already recorded by telemetry. What if the + // application runs in multiple modes? Then it might be useful + // to record the mode. The problem is that an application with + // multiple modes probably doesn't know its mode by line 1 of + // main.main: it might require flag or argument parsing, or + // even validation of an environment variable, and we really + // want to steer users aware from any logic before Start. The + // flags and arguments will be wrong in the child process, and + // every extra conditional branch creates a risk that the + // recursively executed child program will behave not like the + // monitor but like the application. If the child process + // exits before calling Start, then the parent application + // will not have a monitor, and its crash reports will be + // discarded (written in to a pipe that is never read). + // + // So for now, we use this constant string. + const prefix = "crash/crash" + return counter.EncodeStack(pcs, prefix), nil +} + +// parseStackPCs parses the parent process's program counters for the +// first running goroutine out of a GOTRACEBACK=system traceback, +// adjusting them so that they are valid for the child process's text +// segment. +// +// This function returns only program counter values, ensuring that +// there is no possibility of strings from the crash report (which may +// contain PII) leaking into the telemetry system. +func parseStackPCs(crash string) ([]uintptr, error) { + // getPC parses the PC out of a line of the form: + // \tFILE:LINE +0xRELPC sp=... fp=... pc=... + getPC := func(line string) (uint64, error) { + _, pcstr, ok := strings.Cut(line, " pc=") // e.g. pc=0x%x + if !ok { + return 0, fmt.Errorf("no pc= for stack frame: %s", line) + } + return strconv.ParseUint(pcstr, 0, 64) // 0 => allow 0x prefix + } + + var ( + pcs []uintptr + parentSentinel uint64 + childSentinel = sentinel() + on = false // are we in the first running goroutine? + lines = strings.Split(crash, "\n") + ) + for i := 0; i < len(lines); i++ { + line := lines[i] + + // Read sentinel value. + if parentSentinel == 0 && strings.HasPrefix(line, "sentinel ") { + _, err := fmt.Sscanf(line, "sentinel %x", &parentSentinel) + if err != nil { + return nil, fmt.Errorf("can't read sentinel line") + } + continue + } + + // Search for "goroutine GID [STATUS]" + if !on { + if strings.HasPrefix(line, "goroutine ") && + strings.Contains(line, " [running]:") { + on = true + + if parentSentinel == 0 { + return nil, fmt.Errorf("no sentinel value in crash report") + } + } + continue + } + + // A blank line marks end of a goroutine stack. + if line == "" { + break + } + + // Skip the final "created by SYMBOL in goroutine GID" part. + if strings.HasPrefix(line, "created by ") { + break + } + + // Expect a pair of lines: + // SYMBOL(ARGS) + // \tFILE:LINE +0xRELPC sp=0x%x fp=0x%x pc=0x%x + // Note: SYMBOL may contain parens "pkg.(*T).method" + // The RELPC is sometimes missing. + + // Skip the symbol(args) line. + i++ + if i == len(lines) { + break + } + line = lines[i] + + // Parse the PC, and correct for the parent and child's + // different mappings of the text section. + pc, err := getPC(line) + if err != nil { + // Inlined frame, perhaps; skip it. + continue + } + pcs = append(pcs, uintptr(pc-parentSentinel+childSentinel)) + } + return pcs, nil +} + +func min(x, y int) int { + if x < y { + return x + } else { + return y + } +} diff --git a/crashmonitor/monitor_test.go b/crashmonitor/monitor_test.go new file mode 100644 index 0000000..81fe8c2 --- /dev/null +++ b/crashmonitor/monitor_test.go @@ -0,0 +1,161 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package crashmonitor + +import ( + "bytes" + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime/debug" + "strings" + "testing" + + "golang.org/x/telemetry/internal/counter" +) + +func TestMain(m *testing.M) { + entry := os.Getenv("CRASHMONITOR_TEST_ENTRYPOINT") + switch entry { + case "via-stderr": + // This mode bypasses Start and debug.SetCrashOutput; + // the crash is printed to stderr. + debug.SetTraceback("system") + writeSentinel(os.Stderr) + + child() // this line is "TestMain:9" + panic("unreachable") + + case "start.panic", "start.exit": + // These modes uses Start and debug.SetCrashOutput. + // We stub the actual telemetry by instead writing to a file. + incrementCounter = func(name string) { + os.WriteFile(os.Getenv("TELEMETRY_FILE"), []byte(name), 0666) + } + Start() + if entry == "start.panic" { + go func() { + child() // this line is "TestMain.func2:1" + }() + select {} // deadlocks when reached + } else { + os.Exit(42) + } + + default: + os.Exit(m.Run()) // run tests as normal + } +} + +//go:noinline +func child() { + fmt.Println("hello") + grandchild() // this line is "child:2" +} + +//go:noinline +func grandchild() { + panic("oops") // this line is "grandchild:1" +} + +// TestViaStderr is an internal test that asserts that the telemetry +// stack generated by the panic in grandchild is correct. It uses +// stderr, and does not rely on [Start] or [debug.SetCrashOutput]. +func TestViaStderr(t *testing.T) { + _, stderr := runSelf(t, "via-stderr") + got, err := telemetryCounterName(stderr) + if err != nil { + t.Fatal(err) + } + got = sanitize(counter.DecodeStack(got)) + want := "crash/crash\n" + + "runtime.gopanic:--\n" + + "golang.org/x/telemetry/crashmonitor.grandchild:1\n" + + "golang.org/x/telemetry/crashmonitor.child:2\n" + + "golang.org/x/telemetry/crashmonitor.TestMain:9\n" + + "main.main:--\n" + + "runtime.main:--\n" + + "runtime.goexit:--" + if got != want { + t.Errorf("got counter name <<%s>>, want <<%s>>", got, want) + } +} + +// TestStart is an integration test of [Start]. Requires go1.23+. +func TestStart(t *testing.T) { + if !Supported() { + t.Skip("crashmonitor not supported") + } + + // Assert that the crash monitor does nothing when the child + // process merely exits. + t.Run("exit", func(t *testing.T) { + telemetryFile, _ := runSelf(t, "start.exit") + data, err := os.ReadFile(telemetryFile) + if err == nil { + t.Fatalf("telemetry counter <<%s>> was unexpectedly incremented", data) + } + }) + + // Assert that the crash monitor increments a telemetry + // counter of the correct name when the child process panics. + t.Run("panic", func(t *testing.T) { + // Gather a stack trace from executing the panic statement above. + telemetryFile, _ := runSelf(t, "start.panic") + data, err := os.ReadFile(telemetryFile) + if err != nil { + t.Fatalf("failed to read file: %v", err) + } + got := sanitize(counter.DecodeStack(string(data))) + want := "crash/crash\n" + + "runtime.gopanic:--\n" + + "golang.org/x/telemetry/crashmonitor.grandchild:1\n" + + "golang.org/x/telemetry/crashmonitor.child:2\n" + + "golang.org/x/telemetry/crashmonitor.TestMain.func2:1\n" + + "runtime.goexit:--" + if got != want { + t.Errorf("got counter name <<%s>>, want <<%s>>", got, want) + } + }) +} + +// runSelf fork+exec's this test executable using an alternate entry point. +// It returns the child's stderr, and the name of the file +// to which any incremented counter name will be written. +func runSelf(t *testing.T, entrypoint string) (string, []byte) { + exe, err := os.Executable() + if err != nil { + t.Fatal(err) + } + + // Provide the name of the file to the child via the environment. + // The telemetry operation may be stubbed to write to this file. + telemetryFile := filepath.Join(t.TempDir(), "fake.telemetry") + + cmd := exec.Command(exe) + cmd.Env = append(os.Environ(), + "CRASHMONITOR_TEST_ENTRYPOINT="+entrypoint, + "TELEMETRY_FILE="+telemetryFile) + cmd.Stderr = new(bytes.Buffer) + cmd.Run() // failure is expected + stderr := cmd.Stderr.(*bytes.Buffer).Bytes() + if true { // debugging + t.Logf("stderr: %s", stderr) + } + return telemetryFile, stderr +} + +// sanitize redacts the line numbers that we don't control from a counter name. +func sanitize(name string) string { + lines := strings.Split(name, "\n") + for i, line := range lines { + if symbol, _, ok := strings.Cut(line, ":"); ok && + !strings.HasPrefix(line, "golang.org/x/telemetry/crashmonitor") { + lines[i] = symbol + ":--" + } + } + return strings.Join(lines, "\n") +} diff --git a/internal/counter/parse.go b/internal/counter/parse.go index ee896a4..4280c1e 100644 --- a/internal/counter/parse.go +++ b/internal/counter/parse.go @@ -88,43 +88,10 @@ func Parse(filename string, data []byte) (*File, error) { if _, ok := f.Count[string(ename)]; ok { return corrupt() } - ctrName := expandName(ename) + ctrName := DecodeStack(string(ename)) f.Count[ctrName] = v.Load() off = next } } return f, nil } - -func expandName(ename []byte) string { - if !bytes.Contains(ename, []byte{'\n'}) { - // not a stack counter - return string(ename) - } - lines := bytes.Split(ename, []byte{'\n'}) - var lastPath []byte // empty or ends with . - for i, line := range lines { - path, rest := splitLine(line) - if len(path) == 0 { - continue // unchanged - } - if len(path) == 1 && path[0] == '"' { - path = append([]byte{}, lastPath...) //need a deep copy - lines[i] = append(path, rest...) - } else { - lastPath = append(path, '.') - // line unchanged - } - } - return string(bytes.Join(lines, []byte{'\n'})) // trailing \n? -} - -// input is . -// output is (import path, function name) -func splitLine(x []byte) ([]byte, []byte) { - i := bytes.LastIndex(x, []byte{'.'}) - if i < 0 { - return []byte{}, x - } - return x[:i], x[i+1:] -} diff --git a/internal/counter/stackcounter.go b/internal/counter/stackcounter.go index 25b37b3..793e7a8 100644 --- a/internal/counter/stackcounter.go +++ b/internal/counter/stackcounter.go @@ -44,18 +44,38 @@ func (c *StackCounter) Inc() { pcs := make([]uintptr, c.depth) n := runtime.Callers(2, pcs) // caller of Inc pcs = pcs[:n] + c.mu.Lock() defer c.mu.Unlock() + + // Existing counter? + var ctr *Counter for _, s := range c.stacks { if eq(s.pcs, pcs) { if s.counter != nil { - s.counter.Inc() + ctr = s.counter + break } - return } } - // have to create the new counter's name, and the new counter itself - locs := make([]string, 0, c.depth) + + if ctr == nil { + // Create new counter. + ctr = &Counter{ + name: EncodeStack(pcs, c.name), + file: c.file, + } + c.stacks = append(c.stacks, stack{pcs: pcs, counter: ctr}) + } + + ctr.Inc() +} + +// EncodeStack returns the name of the counter to +// use for the given stack of program counters. +// The name encodes the stack. +func EncodeStack(pcs []uintptr, prefix string) string { + var locs []string lastImport := "" frs := runtime.CallersFrames(pcs) for i := 0; ; i++ { @@ -63,7 +83,9 @@ func (c *StackCounter) Inc() { pcline := fr.Line entryptr := fr.Entry var locline string - path, fname := splitPath(fr.Function) + // TODO(adonovan): this CutLast(".") operation isn't + // appropriate for generic function symbols. + path, fname := cutLastDot(fr.Function) if path == lastImport { path = "\"" } else { @@ -89,19 +111,39 @@ func (c *StackCounter) Inc() { } } - name := c.name + "\n" + strings.Join(locs, "\n") + name := prefix + "\n" + strings.Join(locs, "\n") if len(name) > maxNameLen { const bad = "\ntruncated\n" name = name[:maxNameLen-len(bad)] + bad } - ctr := &Counter{name: name, file: c.file} - c.stacks = append(c.stacks, stack{pcs: pcs, counter: ctr}) - ctr.Inc() + return name +} + +// DecodeStack expands the (compressed) stack encoded in the counter name. +func DecodeStack(ename string) string { + if !strings.Contains(ename, "\n") { + return ename // not a stack counter + } + lines := strings.Split(ename, "\n") + var lastPath string // empty or ends with . + for i, line := range lines { + path, rest := cutLastDot(line) + if len(path) == 0 { + continue // unchanged + } + if len(path) == 1 && path[0] == '"' { + lines[i] = lastPath + rest + } else { + lastPath = path + "." + // line unchanged + } + } + return strings.Join(lines, "\n") // trailing \n? } // input is . // output is (import path, function name) -func splitPath(x string) (string, string) { +func cutLastDot(x string) (before, after string) { i := strings.LastIndex(x, ".") if i < 0 { return "", x