original issue: elastic/beats#33258
Long story short: we run `auditbeat` as a DaemonSet on GKE clusters with slightly different versions; some nodes run `docker`, other nodes run `containerd`.
It runs with all the permissions it needs, and `journald` has already been unregistered by an initContainer so that `auditbeat` can get audit events.
The problem is that some random `auditbeat` pods keep outputting this error until we restart them:
ERROR: get status request failed:failed to get audit status reply: no reply received
And if we restart a totally fine `auditbeat` pod, it might start outputting that error too.
It doesn't, however, stop writing audit logs to Elasticsearch. We get audit logs from the pods that are outputting the error just as much as from the other pods.
I traced the error down to this block of code:
|
if len(msgs) == 0 { |
|
return nil, errors.New("no reply received") |
|
} |
Wouldn't it be okay if `msgs` was empty? At this point we have already gotten through this loop without any error:
|
for i := 0; i < 10; i++ { |
|
msgs, err = c.Netlink.Receive(true, parseNetlinkAuditMessage) |
|
if err != nil { |
|
switch { |
|
case errors.Is(err, syscall.EINTR): |
|
continue |
|
case errors.Is(err, syscall.EAGAIN): |
|
time.Sleep(50 * time.Millisecond) |
|
continue |
|
default: |
|
return nil, fmt.Errorf("error receiving audit reply: %w", err) |
|
} |
|
} |
|
break |
|
} |
And `func (c *NetlinkClient) Receive()` already has the appropriate error checks here:
|
func (c *NetlinkClient) Receive(nonBlocking bool, p NetlinkParser) ([]syscall.NetlinkMessage, error) { |
|
var flags int |
|
if nonBlocking { |
|
flags |= syscall.MSG_DONTWAIT |
|
} |
|
|
|
// XXX (akroh): A possible enhancement is to use the MSG_PEEK flag to |
|
// check the message size and increase the buffer size to handle it all. |
|
nr, from, err := syscall.Recvfrom(c.fd, c.readBuf, flags) |
|
if err != nil { |
|
// EAGAIN or EWOULDBLOCK will be returned for non-blocking reads where |
|
// the read would normally have blocked. |
|
return nil, err |
|
} |
|
if nr < syscall.NLMSG_HDRLEN { |
|
return nil, fmt.Errorf("not enough bytes (%v) received to form a netlink header", nr) |
|
} |
|
fromNetlink, ok := from.(*syscall.SockaddrNetlink) |
|
if !ok || fromNetlink.Pid != 0 { |
|
// Spoofed packet received on audit netlink socket. |
|
return nil, errors.New("message received was not from the kernel") |
|
} |
|
|
|
buf := c.readBuf[:nr] |
|
|
|
// Dump raw data for inspection purposes. |
|
if c.respWriter != nil { |
|
if _, err = c.respWriter.Write(buf); err != nil { |
|
return nil, err |
|
} |
|
} |
|
|
|
msgs, err := p(buf) |
|
if err != nil { |
|
return nil, fmt.Errorf("failed to parse netlink messages (bytes_received=%v): %w", nr, err) |
|
} |
|
|
|
return msgs, nil |
|
} |
Shouldn't `len(msgs) == 0` be reported as a warning instead of an error?