Skip to content

Commit

Permalink
Fix spurious deadline exceeded error
Browse files Browse the repository at this point in the history
stdlib_test.TestConnConcurrency had been flickering on CI deadline /
timeout errors. This was extremely confusing because the test deadline
was set for 2 minutes and the errors would occur much quicker.

The problem only manifested in an extremely specific and timing
sensitive situation.

1. The watchdog timer for deadlocked writes starts the goroutine to
   start the background reader
2. The background reader is stopped
3. The next operation is a read without a preceding write (AFAIK only
   CheckConn does this)
4. The deadline is set to interrupt the read
5. The goroutine from 1 actually starts the background reader
6. The background reader gets an error reading the connection with the
   deadline
7. The deadline is cleared
8. The next read on the connection will get the timeout error
  • Loading branch information
jackc committed Oct 14, 2023
1 parent 45f807f commit 81ddcfd
Showing 1 changed file with 25 additions and 6 deletions.
31 changes: 25 additions & 6 deletions pgconn/pgconn.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ type PgConn struct {
frontend *pgproto3.Frontend
bgReader *bgreader.BGReader
slowWriteTimer *time.Timer
bgReaderStarted chan struct{}

config *Config

Expand Down Expand Up @@ -301,8 +302,14 @@ func connect(ctx context.Context, config *Config, fallbackConfig *FallbackConfig
pgConn.parameterStatuses = make(map[string]string)
pgConn.status = connStatusConnecting
pgConn.bgReader = bgreader.New(pgConn.conn)
pgConn.slowWriteTimer = time.AfterFunc(time.Duration(math.MaxInt64), pgConn.bgReader.Start)
pgConn.slowWriteTimer = time.AfterFunc(time.Duration(math.MaxInt64),
func() {
pgConn.bgReader.Start()
pgConn.bgReaderStarted <- struct{}{}
},
)
pgConn.slowWriteTimer.Stop()
pgConn.bgReaderStarted = make(chan struct{})
pgConn.frontend = config.BuildFrontend(pgConn.bgReader, pgConn.conn)

startupMsg := pgproto3.StartupMessage{
Expand Down Expand Up @@ -1732,10 +1739,16 @@ func (pgConn *PgConn) enterPotentialWriteReadDeadlock() {

// exitPotentialWriteReadDeadlock must be called after a call to enterPotentialWriteReadDeadlock.
func (pgConn *PgConn) exitPotentialWriteReadDeadlock() {
// The state of the timer is not relevant upon exiting the potential slow write. It may both
// fire (due to a slow write), or not fire (due to a fast write).
_ = pgConn.slowWriteTimer.Stop()
pgConn.bgReader.Stop()
if !pgConn.slowWriteTimer.Stop() {
// The timer starts its function in a separate goroutine. It is necessary to ensure the background reader has
// started before calling Stop. Otherwise, the background reader may not be stopped. That on its own is not a
// serious problem. But what is a serious problem is that the background reader may start at an inopportune time in
// a subsequent query. For example, if a subsequent query was canceled then a deadline may be set on the net.Conn to
// interrupt an in-progress read. After the read is interrupted, but before the deadline is cleared, the background
// reader could start and read a deadline error. Then the next query would receive the an unexpected deadline error.
<-pgConn.bgReaderStarted
pgConn.bgReader.Stop()
}
}

func (pgConn *PgConn) flushWithPotentialWriteReadDeadlock() error {
Expand Down Expand Up @@ -1830,8 +1843,14 @@ func Construct(hc *HijackedConn) (*PgConn, error) {

pgConn.contextWatcher = newContextWatcher(pgConn.conn)
pgConn.bgReader = bgreader.New(pgConn.conn)
pgConn.slowWriteTimer = time.AfterFunc(time.Duration(math.MaxInt64), pgConn.bgReader.Start)
pgConn.slowWriteTimer = time.AfterFunc(time.Duration(math.MaxInt64),
func() {
pgConn.bgReader.Start()
pgConn.bgReaderStarted <- struct{}{}
},
)
pgConn.slowWriteTimer.Stop()
pgConn.bgReaderStarted = make(chan struct{})
pgConn.frontend = hc.Config.BuildFrontend(pgConn.bgReader, pgConn.conn)

return pgConn, nil
Expand Down

0 comments on commit 81ddcfd

Please sign in to comment.