Description
Go version
go version go1.23.1 linux/amd64
Output of go env
in your module/workspace:
GO111MODULE=''
GOARCH='amd64'
GOBIN=''
GOCACHE='/home/lizf/.cache/go-build'
GOENV='/home/lizf/.config/go/env'
GOEXE=''
GOEXPERIMENT=''
GOFLAGS=''
GOHOSTARCH='amd64'
GOHOSTOS='linux'
GOINSECURE=''
GOMODCACHE='/home/lizf/hny/go/pkg/mod'
GONOPROXY=''
GONOSUMDB=''
GOOS='linux'
GOPATH='/home/lizf/hny/go'
GOPRIVATE=''
GOPROXY='https://proxy.golang.org,direct'
GOROOT='/snap/go/current'
GOSUMDB='sum.golang.org'
GOTMPDIR=''
GOTOOLCHAIN='auto'
GOTOOLDIR='/snap/go/current/pkg/tool/linux_amd64'
GOVCS=''
GOVERSION='go1.23.1'
GODEBUG=''
GOTELEMETRY='local'
GOTELEMETRYDIR='/home/lizf/.config/go/telemetry'
GCCGO='gccgo'
GOAMD64='v1'
AR='ar'
CC='gcc'
CXX='g++'
CGO_ENABLED='1'
GOMOD='/home/lizf/hny/go/src/github.com/honeycombio/hound/go.mod'
GOWORK=''
CGO_CFLAGS='-O2 -g'
CGO_CPPFLAGS=''
CGO_CXXFLAGS='-O2 -g'
CGO_FFLAGS='-O2 -g'
CGO_LDFLAGS='-O2 -g'
PKG_CONFIG='pkg-config'
GOGCCFLAGS='-fPIC -m64 -pthread -Wl,--no-gc-sections -fmessage-length=0 -ffile-prefix-map=/tmp/go-build1515630055=/tmp/go-build -gno-record-gcc-switches'
What did you do?
Compiled using PGO, which inlined this (otherwise not normally inlinable) method:
func (r *Readahead) Read(p []byte) (n int, err error) {
for r.out != nil {
got, _ := r.out.Read(p[n:])
n += got
if n == len(p) {
return n, nil
}
// recycle our block
r.returnBuffer <- r.out
// get another block
r.out = <-r.getNextOutput // Crash happens here, when reporting blocking profiles.
}
err = <-r.err
if err == nil {
err = io.EOF
}
return n, err
}
into this method:
// Returns the requested number of bytes from the internal buffer, without
// copying. Results become invalid next time you do anything.
// Best used for small values.
func (r *Readahead) Next(length int) (buf []byte, err error) {
if r.out == nil {
return nil, io.EOF
}
got := r.out.Next(length)
// Common case - we got everything we need
if len(got) == length {
return got, nil
}
// Uncommon case - partial read across a block boundary
if cap(r.nextBuf) < length {
r.nextBuf = make([]byte, 0, length)
}
r.nextBuf = append(r.nextBuf[:0], got...)
n, err := r.Read(r.nextBuf[len(r.nextBuf):length]) // This function inlined.
return r.nextBuf[:len(r.nextBuf)+n], err
}
and for completeness, the struct:
type Readahead struct {
frame *frame
out *bytes.Buffer
getNextOutput chan *bytes.Buffer
returnBuffer chan *bytes.Buffer
err chan error
// scratch space for Next results which span block boundaries.
nextBuf []byte
}
and also our runtime profiling config:
runtime.SetBlockProfileRate(1000000)
runtime.SetMutexProfileFraction(1000)
What did you see happen?
Rarely, when a blocking event lasting longer than a second occurs while reading from the channel:
fatal error: unexpected signal during runtime execution
[signal SIGSEGV: segmentation violation code=0x1 addr=0x30bc5b31 pc=0x43b1da]
goroutine 1093 gp=0xc004afe700 m=5 mp=0xc0006e8708 [running]:
runtime/mprof.go:592
runtime.chanrecv(0xc00068cd90, 0xc000712788, 0x1)
runtime/chan.go:651 +0x72f fp=0xc000712720 sp=0xc000712620 pc=0x40abaf
github.com/honeycombio/hound/lib/retriever/cstorage/lz4.(*Readahead).Read(0xc00079c780, {0xc00836caa7, 0x1, 0x1})
github.com/honeycombio/hound/lib/retriever/cstorage/lz4.(*Readahead).Next(...)
github.com/honeycombio/hound/lib/retriever/cstorage/compact_column.go:706 +0x251 fp=0xc000712818 sp=0xc0007127a0 pc=0x1bad251
github.com/honeycombio/hound/lib/retriever/cstorage.(*int64CompactReader).read(...)
github.com/honeycombio/hound/lib/retriever/cstorage.(*ColumnReader).readRowsImpl.func2.2.5(_, {{_, _}, _, {_, _}}, _, {0x0, 0x0, 0x0}, ...)
runtime.fpTracebackPartialExpand(...)
runtime/mprof.go:563 +0x35a fp=0xc000712620 sp=0xc000712578 pc=0x43b1da
runtime.chanrecv1(0xc000ba1840?, 0x2?)
github.com/honeycombio/hound/lib/retriever/cstorage/lz4/lz4.go:110 +0x16f fp=0xc0007127a0 sp=0xc000712748 pc=0x1b7322f
github.com/honeycombio/hound/lib/retriever/cstorage/lz4/lz4.go:155
github.com/honeycombio/hound/lib/retriever/cstorage.(*compactFileReader[...]).readRecord(0x1d2f640?, 0x22691a8?)
runtime.throw({0x217ada7?, 0xc000712568?})
runtime/panic.go:1067 +0x48 fp=0xc000712518 sp=0xc0007124e8 pc=0x48af68
runtime.sigpanic()
runtime/signal_unix.go:884 +0x3c9 fp=0xc000712578 sp=0xc000712518 pc=0x48dbc9
runtime.saveblockevent(0x3fbfec, 0x262594, 0x3, 0x2)
runtime.blockevent(...)
runtime/mprof.go:513
runtime/chan.go:489 +0x12 fp=0xc000712748 sp=0xc000712720 pc=0x40a452
github.com/honeycombio/hound/lib/retriever/cstorage/compact_column.go:433
net.(*netFD).Read(...)
crypto/tls.(*atLeastReader).Read(0xc007ddc168, {0xc000eea000?, 0xc000cf83c0?, 0xc000eee431?})
bytes.(*Buffer).ReadFrom(0xc000865eb8, {0x257fa60, 0xc007ddc168})
crypto/tls/conn.go:831
crypto/tls/conn.go:629 +0x49e fp=0xc0029b6b80 sp=0xc0029b67e8 pc=0x6b9e1e
crypto/tls.(*Conn).Read(0xc000865c08, {0xc000c41fc9, 0x3a42, 0x2b5c837?})
crypto/tls/conn.go:1385 +0x150 fp=0xc0029b6bf0 sp=0xc0029b6b80 pc=0x6c33d0
net/http.(*persistConn).Read(0xc0005d8c60, {0xc000c41fc9?, 0x4b4559?, 0x4b4559?})
io.(*LimitedReader).Read(0xc008c0eaf8, {0xc000c41fc9?, 0xc00014c4b0?, 0xc0001c9530?})
github.com/sourcegraph/conc/pool.(*Pool).Wait(0xc008ac8200)
github.com/honeycombio/hound/lib/retriever/retriever.go:1623 +0x1f33 fp=0xc000539528 sp=0xc000538bd8 pc=0x1c20a13
github.com/honeycombio/hound/cmd/retriever-lambda/app.(*App).Invoke(0xc0001af7c0, {0x25a0998, 0xc000869170}, {0xc004b70000, 0x58780, 0x80000})
github.com/aws/aws-lambda-go@v1.47.0/lambda/handler.go:298 +0x43 fp=0xc0005398d0 sp=0xc000539878 pc=0x838123
github.com/aws/aws-lambda-go@v1.47.0/lambda/invoke_loop.go:75 +0x2b2 fp=0xc000539ab8 sp=0xc000539940 pc=0x8386f2
github.com/aws/aws-lambda-go/lambda.startRuntimeAPILoop({0xc000058107, 0xe}, {0x257e140, 0xc0001d4d90})
the problem appears to be at https://github.com/golang/go/blob/go1.23.1/src/runtime/mprof.go#L592 when unwinding the inlined calls on the stack.
We cannot repro this behaviour on arm64; it only repros on amd64. We also do not believe it would repro on go1.22, based on what we are seeing in the code changed between 1.22 and 1.23 — https://go-review.googlesource.com/c/go/+/533258 and 87abb4a are both new additions — but we've already upgraded to 1.23 semantics in our full codebase and don't want to attempt to undo them all.
What did you expect to see?
No crash.
Metadata
Metadata
Assignees
Labels
Type
Projects
Status