Skip to content

Commit

Permalink
internal/crashmonitor: adjust trap frames
Browse files Browse the repository at this point in the history
Typically all frames in the middle of a stack trace are calls, and
report the "return PC". That is, the instruction following the CALL
where the callee will eventually return to.

runtime.CallersFrames is aware of this property and will decrement each
PC by 1 to "back up" to the location of the CALL, which is the actual
line number the user expects.

This does not work for traps (such as a SIGSEGV), as a trap is not a
call, so the reported PC is not the return PC, but the actual PC of the
trap.

runtime.Callers is aware of this and will intentionally increment trap
PCs in order to correct for the decrement performed by
runtime.CallersFrames. See runtime.tracebackPCs and
runtume.(*unwinder).symPC.

We must emulate the same behavior, otherwise we will report the location
of the instruction immediately prior to the trap, which may be on a
different line, or even a different inlined functions.

Fixes golang/go#70637.

Change-Id: I7cd6ffe783a81aa91dcdca313d910341c0bb0440
Reviewed-on: https://go-review.googlesource.com/c/telemetry/+/637755
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Alan Donovan <adonovan@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
  • Loading branch information
prattmic committed Dec 20, 2024
1 parent 773b2e3 commit cc96b6e
Show file tree
Hide file tree
Showing 2 changed files with 217 additions and 50 deletions.
113 changes: 100 additions & 13 deletions internal/crashmonitor/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,35 @@ func telemetryCounterName(crash []byte) (string, error) {
// there is no possibility of strings from the crash report (which may
// contain PII) leaking into the telemetry system.
func parseStackPCs(crash string) ([]uintptr, error) {
// getSymbol parses the symbol name out of a line of the form:
// SYMBOL(ARGS)
//
// Note: SYMBOL may contain parens "pkg.(*T).method". However, type
// parameters are always replaced with ..., so they cannot introduce
// more parens. e.g., "pkg.(*T[...]).method".
//
// ARGS can contain parens. We want the first paren that is not
// immediately preceded by a ".".
//
// TODO(prattmic): This is mildly complicated and is only used to find
// runtime.sigpanic, so perhaps simplify this by checking explicitly
// for sigpanic.
getSymbol := func(line string) (string, error) {
var prev rune
for i, c := range line {
if line[i] != '(' {
prev = c
continue
}
if prev == '.' {
prev = c
continue
}
return line[:i], nil
}
return "", fmt.Errorf("no symbol for stack frame: %s", line)
}

// getPC parses the PC out of a line of the form:
// \tFILE:LINE +0xRELPC sp=... fp=... pc=...
getPC := func(line string) (uint64, error) {
Expand All @@ -186,6 +215,9 @@ func parseStackPCs(crash string) ([]uintptr, error) {
childSentinel = sentinel()
on = false // are we in the first running goroutine?
lines = strings.Split(crash, "\n")
symLine = true // within a goroutine, every other line is a symbol or file/line/pc location, starting with symbol.
currSymbol string
prevSymbol string // symbol of the most recent previous frame with a PC.
)
for i := 0; i < len(lines); i++ {
line := lines[i]
Expand Down Expand Up @@ -228,21 +260,76 @@ func parseStackPCs(crash string) ([]uintptr, error) {
// Note: SYMBOL may contain parens "pkg.(*T).method"
// The RELPC is sometimes missing.

// Skip the symbol(args) line.
i++
if i == len(lines) {
break
}
line = lines[i]
if symLine {
var err error
currSymbol, err = getSymbol(line)
if err != nil {
return nil, fmt.Errorf("error extracting symbol: %v", err)
}

// Parse the PC, and correct for the parent and child's
// different mappings of the text section.
pc, err := getPC(line)
if err != nil {
// Inlined frame, perhaps; skip it.
continue
symLine = false // Next line is FILE:LINE.
} else {
// Parse the PC, and correct for the parent and child's
// different mappings of the text section.
pc, err := getPC(line)
if err != nil {
// Inlined frame, perhaps; skip it.

// Done with this frame. Next line is a new frame.
//
// Don't update prevSymbol; we only want to
// track frames with a PC.
currSymbol = ""
symLine = true
continue
}

pc = pc-parentSentinel+childSentinel

// If the previous frame was sigpanic, then this frame
// was a trap (e.g., SIGSEGV).
//
// Typically all middle frames are calls, and report
// the "return PC". That is, the instruction following
// the CALL where the callee will eventually return to.
//
// runtime.CallersFrames is aware of this property and
// will decrement each PC by 1 to "back up" to the
// location of the CALL, which is the actual line
// number the user expects.
//
// This does not work for traps, as a trap is not a
// call, so the reported PC is not the return PC, but
// the actual PC of the trap.
//
// runtime.Callers is aware of this and will
// intentionally increment trap PCs in order to correct
// for the decrement performed by
// runtime.CallersFrames. See runtime.tracebackPCs and
// runtume.(*unwinder).symPC.
//
// We must emulate the same behavior, otherwise we will
// report the location of the instruction immediately
// prior to the trap, which may be on a different line,
// or even a different inlined functions.
//
// TODO(prattmic): The runtime applies the same trap
// behavior for other "injected calls", see injectCall
// in runtime.(*unwinder).next. Do we want to handle
// those as well? I don't believe we'd ever see
// runtime.asyncPreempt or runtime.debugCallV2 in a
// typical crash.
if prevSymbol == "runtime.sigpanic" {
pc++
}

pcs = append(pcs, uintptr(pc))

// Done with this frame. Next line is a new frame.
prevSymbol = currSymbol
currSymbol = ""
symLine = true
}
pcs = append(pcs, uintptr(pc-parentSentinel+childSentinel))
}
return pcs, nil
}
Expand Down
154 changes: 117 additions & 37 deletions internal/crashmonitor/monitor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,20 @@ import (
func TestMain(m *testing.M) {
entry := os.Getenv("CRASHMONITOR_TEST_ENTRYPOINT")
switch entry {
case "via-stderr":
case "via-stderr.panic", "via-stderr.trap":
// This mode bypasses Start and debug.SetCrashOutput;
// the crash is printed to stderr.
debug.SetTraceback("system")
crashmonitor.WriteSentinel(os.Stderr)

child() // this line is "TestMain:+9"
if entry == "via-stderr.panic" {
childPanic() // this line is "TestMain:+10"
} else {
childTrap() // this line is "TestMain:+12"
}
panic("unreachable")

case "start.panic", "start.exit":
case "start.panic", "start.trap", "start.exit":
// These modes uses Start and debug.SetCrashOutput.
// We stub the actual telemetry by instead writing to a file.
crashmonitor.SetIncrementCounter(func(name string) {
Expand All @@ -46,12 +50,18 @@ func TestMain(m *testing.M) {
ReportCrashes: true,
TelemetryDir: os.Getenv("CRASHMONITOR_TELEMETRY_DIR"),
})
if entry == "start.panic" {
switch entry {
case "start.panic":
go func() {
child() // this line is "TestMain.func2:1"
childPanic() // this line is "TestMain.func3:+1"
}()
select {} // deadlocks when reached
} else {
case "start.trap":
go func() {
childTrap() // this line is "TestMain.func4:+1"
}()
select {} // deadlocks when reached
case "start.exit":
os.Exit(42)
}

Expand All @@ -60,48 +70,94 @@ func TestMain(m *testing.M) {
}
}

func child() {
func childPanic() {
fmt.Println("hello")
grandchildPanic() // this line is "childPanic:+2"
}

func grandchildPanic() {
panic("oops") // this line is "grandchildPanic:=79" (the call from child is inlined)
}

var sinkPtr *int

func childTrap() {
fmt.Println("hello")
grandchild() // this line is "child:+2"
grandchildTrap(sinkPtr) // this line is "childTrap:+2"
}

func grandchild() {
panic("oops") // this line is "grandchild:=92" (the call from child is inlined)
func grandchildTrap(i *int) {
*i = 42 // this line is "grandchildTrap:=90" (the call from childTrap is inlined)
}

// TestViaStderr is an internal test that asserts that the telemetry
// stack generated by the panic in grandchild is correct. It uses
// stderr, and does not rely on [start.Start] or [debug.SetCrashOutput].
func TestViaStderr(t *testing.T) {
_, _, stderr := runSelf(t, "via-stderr")
got, err := crashmonitor.TelemetryCounterName(stderr)
if err != nil {
t.Fatal(err)
}
got = sanitize(counter.DecodeStack(got))
want := "crash/crash\n" +
// Standard panic.
t.Run("panic", func(t *testing.T) {
_, _, stderr := runSelf(t, "via-stderr.panic")
got, err := crashmonitor.TelemetryCounterName(stderr)
if err != nil {
t.Fatal(err)
}
got = sanitize(counter.DecodeStack(got))
want := "crash/crash\n" +
"runtime.gopanic:--\n" +
"golang.org/x/telemetry/internal/crashmonitor_test.grandchild:=69\n" +
"golang.org/x/telemetry/internal/crashmonitor_test.child:+2\n" +
"golang.org/x/telemetry/internal/crashmonitor_test.TestMain:+9\n" +
"golang.org/x/telemetry/internal/crashmonitor_test.grandchildPanic:=79\n" +
"golang.org/x/telemetry/internal/crashmonitor_test.childPanic:+2\n" +
"golang.org/x/telemetry/internal/crashmonitor_test.TestMain:+10\n" +
"main.main:--\n" +
"runtime.main:--\n" +
"runtime.goexit:--"

if !crashmonitor.Supported() { // !go1.23
// Traceback excludes PCs for inlined frames. Before go1.23
// (https://go.dev/cl/571798 specifically), passing the set of
// PCs in the traceback to runtime.CallersFrames, would report
// only the innermost inlined frame and none of the inline
// "callers".
//
// Thus, here we must drop the caller of the inlined frame.
want = strings.ReplaceAll(want, "golang.org/x/telemetry/internal/crashmonitor_test.child:+2\n", "")
}
if !crashmonitor.Supported() { // !go1.23
// Traceback excludes PCs for inlined frames. Before
// go1.23 (https://go.dev/cl/571798 specifically),
// passing the set of PCs in the traceback to
// runtime.CallersFrames, would report only the
// innermost inlined frame and none of the inline
// "callers".
//
// Thus, here we must drop the caller of the inlined
// frame.
want = strings.ReplaceAll(want, "golang.org/x/telemetry/internal/crashmonitor_test.childPanic:+2\n", "")
}

if got != want {
t.Errorf("got counter name <<%s>>, want <<%s>>", got, want)
}
if got != want {
t.Errorf("got counter name <<%s>>, want <<%s>>", got, want)
}
})

// Panic via trap.
t.Run("trap", func(t *testing.T) {
_, _, stderr := runSelf(t, "via-stderr.trap")
got, err := crashmonitor.TelemetryCounterName(stderr)
if err != nil {
t.Fatal(err)
}
got = sanitize(counter.DecodeStack(got))
want := "crash/crash\n" +
"runtime.gopanic:--\n" +
"runtime.panicmem:--\n" +
"runtime.sigpanic:--\n" +
"golang.org/x/telemetry/internal/crashmonitor_test.grandchildTrap:=90\n" +
"golang.org/x/telemetry/internal/crashmonitor_test.childTrap:+2\n" +
"golang.org/x/telemetry/internal/crashmonitor_test.TestMain:+12\n" +
"main.main:--\n" +
"runtime.main:--\n" +
"runtime.goexit:--"

if !crashmonitor.Supported() { // !go1.23
// See above.
want = strings.ReplaceAll(want, "runtime.sigpanic:--\n", "")
want = strings.ReplaceAll(want, "golang.org/x/telemetry/internal/crashmonitor_test.childTrap:+2\n", "")
}

if got != want {
t.Errorf("got counter name <<%s>>, want <<%s>>", got, want)
}
})
}

func waitForExitFile(t *testing.T, exitFile string) {
Expand Down Expand Up @@ -144,8 +200,8 @@ func TestStart(t *testing.T) {
}
})

// Assert that the crash monitor increments a telemetry
// counter of the correct name when the child process panics.
// Assert that the crash monitor increments a telemetry counter of the
// correct name when the child process panics.
t.Run("panic", func(t *testing.T) {
// Gather a stack trace from executing the panic statement above.
telemetryFile, exitFile, _ := runSelf(t, "start.panic")
Expand All @@ -157,14 +213,38 @@ func TestStart(t *testing.T) {
got := sanitize(counter.DecodeStack(string(data)))
want := "crash/crash\n" +
"runtime.gopanic:--\n" +
"golang.org/x/telemetry/internal/crashmonitor_test.grandchild:=69\n" +
"golang.org/x/telemetry/internal/crashmonitor_test.child:+2\n" +
"golang.org/x/telemetry/internal/crashmonitor_test.grandchildPanic:=79\n" +
"golang.org/x/telemetry/internal/crashmonitor_test.childPanic:+2\n" +
"golang.org/x/telemetry/internal/crashmonitor_test.TestMain.func3:+1\n" +
"runtime.goexit:--"
if got != want {
t.Errorf("got counter name <<%s>>, want <<%s>>", got, want)
}
})

// Assert that the crash monitor increments a telemetry counter of the
// correct name when the child process panics via trap.
t.Run("trap", func(t *testing.T) {
// Gather a stack trace from executing the panic statement above.
telemetryFile, exitFile, _ := runSelf(t, "start.trap")
waitForExitFile(t, exitFile)
data, err := os.ReadFile(telemetryFile)
if err != nil {
t.Fatalf("failed to read file: %v", err)
}
got := sanitize(counter.DecodeStack(string(data)))
want := "crash/crash\n" +
"runtime.gopanic:--\n" +
"runtime.panicmem:--\n" +
"runtime.sigpanic:--\n" +
"golang.org/x/telemetry/internal/crashmonitor_test.grandchildTrap:=90\n" +
"golang.org/x/telemetry/internal/crashmonitor_test.childTrap:+2\n" +
"golang.org/x/telemetry/internal/crashmonitor_test.TestMain.func4:+1\n" +
"runtime.goexit:--"
if got != want {
t.Errorf("got counter name <<%s>>, want <<%s>>", got, want)
}
})
}

// runSelf fork+exec's this test executable using an alternate entry point.
Expand Down

0 comments on commit cc96b6e

Please sign in to comment.