runtime: implement GC pacer redesign
This change implements the GC pacer redesign outlined in #44167 and the
accompanying design document, behind a GOEXPERIMENT flag that is on by
default.

In addition to adding the new pacer, this CL includes code to track
and account for stack and globals scan work in the pacer and in the
assist credit system.
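
As context for how that accounting feeds mutator assists, here is a
minimal sketch following the design document's formulation; the names
and the exact formula are illustrative, not the runtime's actual
fields:

func assistWorkPerByte(heapScan, stackScan, globalsScan, workDone, heapGoal, heapLive int64) float64 {
	// Expected scan work now includes stacks and globals, not just heap objects.
	scanWorkExpected := heapScan + stackScan + globalsScan
	remainingWork := scanWorkExpected - workDone
	remainingBytes := heapGoal - heapLive
	if remainingWork <= 0 || remainingBytes <= 0 {
		// At or past the goal; the real pacer handles this case specially.
		return 0
	}
	// Each byte allocated during the cycle must pay for this much scan work.
	return float64(remainingWork) / float64(remainingBytes)
}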

The new pacer also deviates slightly from the design document in that
it raises the lower bound on the trigger ratio from 0.6 to 0.7 (both
scaled by GOGC). The reasoning behind this change is that the new pacer
hits the heap goal much more consistently (good!), leading to slightly
less frequent, but _longer_, GC cycles (in this case, bad!). It turns
out that the cost of keeping the GC active for longer hurts throughput
significantly (per byte of memory used), though tail latencies can
improve by up to 10%! To be conservative, this change settles on 0.7,
where there is a small improvement to both throughput and latency,
given the memory used.
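
Concretely, the new bound amounts to a clamp like the following (a
sketch: the real pacer first derives the trigger ratio from its
feedback loop, and the 0.95 upper cap shown here is an assumption
carried over from the old pacer's logic):

func boundTriggerRatio(triggerRatio, gcPercent float64) float64 {
	// gcPercent is GOGC/100. The lower bound moved from 0.6 to 0.7.
	if lo := 0.7 * gcPercent; triggerRatio < lo {
		triggerRatio = lo
	}
	// Keep the trigger comfortably below the goal.
	if hi := 0.95 * gcPercent; triggerRatio > hi {
		triggerRatio = hi
	}
	return triggerRatio
}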

Because the new pacer accounts for the two most significant sources of
scan work after heap objects, it is now also safer to reduce the
minimum heap size without causing very poor amortization of GC costs.
This change therefore decreases the minimum heap size to 512 KiB, which
corresponds to the roughly 200 KiB of scannable globals the runtime
always has present up-front, providing a baseline of scan work.

Benchmark results: https://perf.golang.org/search?q=upload:20211001.6

tile38's KNearest benchmark shows a memory increase, but throughput (and
latency) per byte of memory used is better.

gopher-lua showed an increase in both CPU time and memory usage, but
subsequent attempts to reproduce this behavior have been inconsistent:
sometimes overall performance is better, sometimes it is worse. This
suggests that the benchmark is fairly noisy in a way not captured by
the benchmarking framework itself.

biogo-igor is the only benchmark to show a significant performance
loss. This benchmark exhibits a very high GC rate, with relatively
little work to do in each cycle, and its idle mark workers are quite
active. Under the new pacer, mark phases are longer, mark assists are
fewer, and some of the time spent in mark assists has shifted to idle
workers. Linux perf indicates that the difference in CPU time is mostly
attributable to calls on the write-barrier slow path, which in turn
suggests that the write barrier being on for longer is the primary
culprit. This also explains the memory increase: a longer mark phase
means more memory is allocated black, surviving an extra cycle and
contributing to the heap goal.

For #44167.

Change-Id: I8ac7cfef7d593e4a642c9b2be43fb3591a8ec9c4
Reviewed-on: https://go-review.googlesource.com/c/go/+/309869
Trust: Michael Knyszek <mknyszek@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
mknyszek committed Nov 4, 2021
1 parent 988efd5 commit a108b28
Showing 9 changed files with 680 additions and 103 deletions.
9 changes: 9 additions & 0 deletions src/internal/goexperiment/exp_pacerredesign_off.go

(Generated file; contents not rendered.)

9 changes: 9 additions & 0 deletions src/internal/goexperiment/exp_pacerredesign_on.go

(Generated file; contents not rendered.)
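
The "on" variant of this generated file plausibly contains the
following nine lines (matching the addition count above; sketched from
the goexperiment package's conventions, not copied from the CL). The
"off" variant mirrors it with a negated build tag and false/0 values:

// Code generated by mkconsts.go. DO NOT EDIT.

//go:build goexperiment.pacerredesign
// +build goexperiment.pacerredesign

package goexperiment

const PacerRedesign = true
const PacerRedesignInt = 1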

6 changes: 6 additions & 0 deletions src/internal/goexperiment/flags.go
@@ -83,4 +83,10 @@ type Flags struct {
// Requires wrappers (to do ABI translation), and reflect (so
// reflection calls use registers).
RegabiArgs bool

// PacerRedesign enables the new GC pacer in the runtime.
//
// Details regarding the new pacer may be found at
// https://golang.org/design/44167-gc-pacer-redesign
PacerRedesign bool
}
4 changes: 3 additions & 1 deletion src/runtime/export_test.go
@@ -1290,7 +1290,9 @@ type GCControllerReviseDelta struct {
func (c *GCController) Revise(d GCControllerReviseDelta) {
c.heapLive += uint64(d.HeapLive)
c.heapScan += uint64(d.HeapScan)
c.scanWork += d.HeapScanWork + d.StackScanWork + d.GlobalsScanWork
c.heapScanWork.Add(d.HeapScanWork)
c.stackScanWork.Add(d.StackScanWork)
c.globalsScanWork.Add(d.GlobalsScanWork)
c.revise()
}

2 changes: 2 additions & 0 deletions src/runtime/mgc.go
@@ -1084,6 +1084,8 @@ func gcMarkTermination(nextTriggerRatio float64) {
print(" ms cpu, ",
work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
work.heapGoal>>20, " MB goal, ",
gcController.stackScan>>20, " MB stacks, ",
gcController.globalsScan>>20, " MB globals, ",
work.maxprocs, " P")
if work.userForced {
print(" (forced)")
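
With GODEBUG=gctrace=1, the two new fields slot in after the goal. An
illustrative trace line under this change (all values made up) might
read:

gc 5 @2.104s 0%: 0.058+1.2+0.083 ms clock, 0.70+0.37/1.5/0+1.0 ms cpu, 7->7->3 MB, 8 MB goal, 0 MB stacks, 0 MB globals, 12 P
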
117 changes: 84 additions & 33 deletions src/runtime/mgcmark.go
@@ -8,6 +8,7 @@ package runtime

import (
"internal/goarch"
"internal/goexperiment"
"runtime/internal/atomic"
"runtime/internal/sys"
"unsafe"
@@ -151,20 +152,28 @@ var oneptrmask = [...]uint8{1}
//
// Preemption must be disabled (because this uses a gcWork).
//
// Returns the amount of GC work credit produced by the operation.
// If flushBgCredit is true, then that credit is also flushed
// to the background credit pool.
//
// nowritebarrier is only advisory here.
//
//go:nowritebarrier
func markroot(gcw *gcWork, i uint32) {
func markroot(gcw *gcWork, i uint32, flushBgCredit bool) int64 {
// Note: if you add a case here, please also update heapdump.go:dumproots.
var workDone int64
var workCounter *atomic.Int64
switch {
case work.baseData <= i && i < work.baseBSS:
workCounter = &gcController.globalsScanWork
for _, datap := range activeModules() {
markrootBlock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, gcw, int(i-work.baseData))
workDone += markrootBlock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, gcw, int(i-work.baseData))
}

case work.baseBSS <= i && i < work.baseSpans:
workCounter = &gcController.globalsScanWork
for _, datap := range activeModules() {
markrootBlock(datap.bss, datap.ebss-datap.bss, datap.gcbssmask.bytedata, gcw, int(i-work.baseBSS))
workDone += markrootBlock(datap.bss, datap.ebss-datap.bss, datap.gcbssmask.bytedata, gcw, int(i-work.baseBSS))
}

case i == fixedRootFinalizers:
@@ -184,6 +193,7 @@ func markroot(gcw *gcWork, i uint32) {

default:
// the rest is scanning goroutine stacks
workCounter = &gcController.stackScanWork
var gp *g
if work.baseStacks <= i && i < work.baseEnd {
// N.B. Atomic read of allglen in gcMarkRootPrepare
@@ -230,7 +240,7 @@ func markroot(gcw *gcWork, i uint32) {
if gp.gcscandone {
throw("g already scanned")
}
scanstack(gp, gcw)
workDone += scanstack(gp, gcw)
gp.gcscandone = true
resumeG(stopped)

@@ -239,13 +249,24 @@ func markroot(gcw *gcWork, i uint32) {
}
})
}
if goexperiment.PacerRedesign {
if workCounter != nil && workDone != 0 {
workCounter.Add(workDone)
if flushBgCredit {
gcFlushBgCredit(workDone)
}
}
}
return workDone
}
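
The flushBgCredit parameter encodes a contract with markroot's two
kinds of callers. A hedged sketch of that contract (drainOneRoot is a
hypothetical helper, not part of this CL):

func drainOneRoot(gcw *gcWork, job uint32, background bool) int64 {
	if background {
		// Background draining (gcDrain): markroot flushes the credit
		// to the shared pool so mutator assists can draw on it.
		return markroot(gcw, job, true)
	}
	// Assist draining (gcDrainN): keep the returned work as the
	// assist's own credit rather than donating it.
	return markroot(gcw, job, false)
}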

// markrootBlock scans the shard'th shard of the block of memory [b0,
// b0+n0), with the given pointer mask.
//
// Returns the amount of work done.
//
//go:nowritebarrier
func markrootBlock(b0, n0 uintptr, ptrmask0 *uint8, gcw *gcWork, shard int) {
func markrootBlock(b0, n0 uintptr, ptrmask0 *uint8, gcw *gcWork, shard int) int64 {
if rootBlockBytes%(8*goarch.PtrSize) != 0 {
// This is necessary to pick byte offsets in ptrmask0.
throw("rootBlockBytes must be a multiple of 8*ptrSize")
@@ -256,7 +277,7 @@ func markrootBlock(b0, n0 uintptr, ptrmask0 *uint8, gcw *gcWork, shard int) {
// These tests are written to avoid any possible overflow.
off := uintptr(shard) * rootBlockBytes
if off >= n0 {
return
return 0
}
b := b0 + off
ptrmask := (*uint8)(add(unsafe.Pointer(ptrmask0), uintptr(shard)*(rootBlockBytes/(8*goarch.PtrSize))))
@@ -267,6 +288,7 @@ func markrootBlock(b0, n0 uintptr, ptrmask0 *uint8, gcw *gcWork, shard int) {

// Scan this shard.
scanblock(b, n, ptrmask, gcw, nil)
return int64(n)
}

// markrootFreeGStacks frees stacks of dead Gs.
@@ -681,6 +703,13 @@ func gcFlushBgCredit(scanWork int64) {

// scanstack scans gp's stack, greying all pointers found on the stack.
//
// For goexperiment.PacerRedesign:
// Returns the amount of scan work performed, but doesn't update
// gcController.stackScanWork or flush any credit. Any background credit produced
// by this function should be flushed by its caller. scanstack itself can't
// safely flush because doing so may try to wake up a goroutine that was
// just scanned, resulting in a self-deadlock.
//
// scanstack will also shrink the stack if it is safe to do so. If it
// is not, it schedules a stack shrink for the next synchronous safe
// point.
@@ -690,7 +719,7 @@ func gcFlushBgCredit(scanWork int64) {
//
//go:nowritebarrier
//go:systemstack
func scanstack(gp *g, gcw *gcWork) {
func scanstack(gp *g, gcw *gcWork) int64 {
if readgstatus(gp)&_Gscan == 0 {
print("runtime:scanstack: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", hex(readgstatus(gp)), "\n")
throw("scanstack - bad status")
@@ -701,7 +730,7 @@ func scanstack(gp *g, gcw *gcWork) {
print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
throw("mark - bad status")
case _Gdead:
return
return 0
case _Grunning:
print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
throw("scanstack: goroutine not stopped")
@@ -713,6 +742,15 @@ func scanstack(gp *g, gcw *gcWork) {
throw("can't scan our own stack")
}

// stackSize is the amount of work we'll be reporting.
//
// We report the total stack size, more than we scan,
// because this number needs to line up with gcControllerState's
// stackScan and scannableStackSize fields.
//
// See the documentation on those fields for more information.
stackSize := gp.stack.hi - gp.stack.lo

if isShrinkStackSafe(gp) {
// Shrink the stack if not much of it is being used.
shrinkstack(gp)
@@ -852,6 +890,7 @@ func scanstack(gp *g, gcw *gcWork) {
if state.buf != nil || state.cbuf != nil || state.freeBuf != nil {
throw("remaining pointer buffers")
}
return int64(stackSize)
}

// Scan a stack frame: local variables and function arguments/results.
@@ -984,7 +1023,7 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) {
flushBgCredit := flags&gcDrainFlushBgCredit != 0
idle := flags&gcDrainIdle != 0

initScanWork := gcw.scanWork
initScanWork := gcw.heapScanWork

// checkWork is the scan work before performing the next
// self-preempt check.
@@ -1007,7 +1046,7 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) {
if job >= work.markrootJobs {
break
}
markroot(gcw, job)
markroot(gcw, job, flushBgCredit)
if check != nil && check() {
goto done
}
@@ -1046,14 +1085,14 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) {
// Flush background scan work credit to the global
// account if we've accumulated enough locally so
// mutator assists can draw on it.
if gcw.scanWork >= gcCreditSlack {
atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
if gcw.heapScanWork >= gcCreditSlack {
gcController.heapScanWork.Add(gcw.heapScanWork)
if flushBgCredit {
gcFlushBgCredit(gcw.scanWork - initScanWork)
gcFlushBgCredit(gcw.heapScanWork - initScanWork)
initScanWork = 0
}
checkWork -= gcw.scanWork
gcw.scanWork = 0
checkWork -= gcw.heapScanWork
gcw.heapScanWork = 0

if checkWork <= 0 {
checkWork += drainCheckThreshold
@@ -1066,12 +1105,12 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) {

done:
// Flush remaining scan work credit.
if gcw.scanWork > 0 {
atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
if gcw.heapScanWork > 0 {
gcController.heapScanWork.Add(gcw.heapScanWork)
if flushBgCredit {
gcFlushBgCredit(gcw.scanWork - initScanWork)
gcFlushBgCredit(gcw.heapScanWork - initScanWork)
}
gcw.scanWork = 0
gcw.heapScanWork = 0
}
}
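
The flush above follows a batching pattern used throughout this file:
scan work accumulates locally in the gcWork and is only published once
it crosses gcCreditSlack, limiting contention on the shared atomics. A
self-contained toy version (sync/atomic stands in for the runtime's
internal atomic package, and the names are illustrative):

import "sync/atomic"

// batchedCounter accumulates work locally and publishes it to a shared
// counter only once it crosses a slack threshold, like gcDrain does
// with gcw.heapScanWork and gcCreditSlack.
type batchedCounter struct {
	local  int64
	global *atomic.Int64
}

func (b *batchedCounter) add(work, slack int64) {
	b.local += work
	if b.local >= slack {
		b.global.Add(b.local)
		b.local = 0
	}
}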

@@ -1095,10 +1134,10 @@ func gcDrainN(gcw *gcWork, scanWork int64) int64 {

// There may already be scan work on the gcw, which we don't
// want to claim was done by this call.
workFlushed := -gcw.scanWork
workFlushed := -gcw.heapScanWork

gp := getg().m.curg
for !gp.preempt && workFlushed+gcw.scanWork < scanWork {
for !gp.preempt && workFlushed+gcw.heapScanWork < scanWork {
// See gcDrain comment.
if work.full == 0 {
gcw.balance()
@@ -1117,13 +1156,13 @@ func gcDrainN(gcw *gcWork, scanWork int64) int64 {

if b == 0 {
// Try to do a root job.
//
// TODO: Assists should get credit for this
// work.
if work.markrootNext < work.markrootJobs {
job := atomic.Xadd(&work.markrootNext, +1) - 1
if job < work.markrootJobs {
markroot(gcw, job)
work := markroot(gcw, job, false)
if goexperiment.PacerRedesign {
workFlushed += work
}
continue
}
}
@@ -1134,25 +1173,25 @@ func gcDrainN(gcw *gcWork, scanWork int64) int64 {
scanobject(b, gcw)

// Flush background scan work credit.
if gcw.scanWork >= gcCreditSlack {
atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
workFlushed += gcw.scanWork
gcw.scanWork = 0
if gcw.heapScanWork >= gcCreditSlack {
gcController.heapScanWork.Add(gcw.heapScanWork)
workFlushed += gcw.heapScanWork
gcw.heapScanWork = 0
}
}

// Unlike gcDrain, there's no need to flush remaining work
// here because this never flushes to bgScanCredit and
// gcw.dispose will flush any remaining work to scanWork.

return workFlushed + gcw.scanWork
return workFlushed + gcw.heapScanWork
}
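
To see why workFlushed starts out negative: if the gcWork arrives
already holding, say, 100 units of unflushed scan work, workFlushed
begins at -100. If this call then scans 500 more units and all 600 get
flushed along the way, the function returns -100 + 600 = 500, crediting
the assist only for the work it actually performed.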

// scanblock scans b as scanobject would, but using an explicit
// pointer bitmap instead of the heap bitmap.
//
// This is used to scan non-heap roots, so it does not update
// gcw.bytesMarked or gcw.scanWork.
// gcw.bytesMarked or gcw.heapScanWork.
//
// If stk != nil, possible stack pointers are also reported to stk.putPtr.
//go:nowritebarrier
@@ -1282,7 +1321,7 @@ func scanobject(b uintptr, gcw *gcWork) {
}
}
gcw.bytesMarked += uint64(n)
gcw.scanWork += int64(i)
gcw.heapScanWork += int64(i)
}

// scanConservative scans block [b, b+n) conservatively, treating any
@@ -1521,7 +1560,19 @@ func gcmarknewobject(span *mspan, obj, size, scanSize uintptr) {

gcw := &getg().m.p.ptr().gcw
gcw.bytesMarked += uint64(size)
gcw.scanWork += int64(scanSize)
if !goexperiment.PacerRedesign {
// The old pacer counts newly allocated memory toward
// heapScanWork because heapScan is continuously updated
// throughout the GC cycle with newly allocated memory. However,
// these objects are never actually scanned, so we need
// to account for them in heapScanWork here, "faking" their work.
// Otherwise the pacer will think it's always behind, potentially
// by a large margin.
//
// The new pacer doesn't care about this because it ceases to update
// heapScan once a GC cycle starts, effectively snapshotting it.
gcw.heapScanWork += int64(scanSize)
}
}
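
A compact restatement of the branch above (a sketch, not runtime code):

// newObjectScanCredit returns the scan work to "fake" for a newly
// allocated, never-scanned object. The old pacer needs it because
// heapScan keeps growing mid-cycle; the redesigned pacer snapshots
// heapScan at cycle start, so no adjustment is needed.
func newObjectScanCredit(scanSize int64, pacerRedesign bool) int64 {
	if pacerRedesign {
		return 0
	}
	return scanSize
}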

// gcMarkTinyAllocs greys all active tiny alloc blocks.