cmd/compile: wire up math/bits.TrailingZeros intrinsics for loong64

The runtime malloc implementation makes use of these, among others. Some generic strength reduction rules for Ctz ops have also been added, though only enabled for loong64 for now. This is necessary to make the optimization profitable at all, as the LA464 architecture apparently handles the `TrailingZeros64(x) < 64` part in runtime.nextFreeFast very badly if the compiled branch isn't a simple BEQZ any more (that used to be the case before, when the compiler is able to peek into the pure Go implementation of TrailingZeros). Without the generic rules this change is going to be a big perf hit (as bad as 7~10% in select go1 benchmark cases). The generic changes are benchmarked on linux/amd64 (Threadripper 3990X) and darwin/arm64 (Apple M1 Pro) too, but results are either mixed (amd64) or even net loss (arm64). So, for now those rules are guarded with a predicate that only enables them for loong64. Micro-benchmark results on Loongson 3A5000: goos: linux goarch: loong64 pkg: math/bits │ before │ after │ │ sec/op │ sec/op vs base │ TrailingZeros 2.758n ± 0% 1.004n ± 0% -63.60% (p=0.000 n=10) TrailingZeros8 1.508n ± 0% 1.219n ± 0% -19.20% (p=0.000 n=10) TrailingZeros16 3.526n ± 0% 1.437n ± 0% -59.25% (p=0.000 n=10) TrailingZeros32 3.161n ± 0% 1.004n ± 0% -68.23% (p=0.000 n=10) TrailingZeros64 2.759n ± 0% 1.003n ± 0% -63.65% (p=0.000 n=10) geomean 2.638n 1.121n -57.51% Go1 benchmark results on the same machine: goos: linux goarch: loong64 pkg: test/bench/go1 │ CL 479496 v8 │ this CL │ │ sec/op │ sec/op vs base │ BinaryTree17 14.10 ± 1% 13.64 ± 1% -3.28% (p=0.000 n=10) Fannkuch11 3.421 ± 0% 3.421 ± 0% ~ (p=0.075 n=10) FmtFprintfEmpty 94.78n ± 0% 94.50n ± 0% -0.30% (p=0.000 n=10) FmtFprintfString 155.0n ± 0% 154.1n ± 1% ~ (p=1.000 n=10) FmtFprintfInt 157.2n ± 0% 155.2n ± 1% -1.27% (p=0.000 n=10) FmtFprintfIntInt 242.1n ± 0% 238.0n ± 1% -1.73% (p=0.000 n=10) FmtFprintfPrefixedInt 337.6n ± 0% 334.6n ± 0% -0.89% (p=0.000 n=10) FmtFprintfFloat 399.0n ± 0% 396.4n ± 0% -0.65% 
(p=0.000 n=10) FmtManyArgs 959.8n ± 0% 923.4n ± 0% -3.79% (p=0.000 n=10) GobDecode 15.63m ± 3% 15.17m ± 1% -2.90% (p=0.001 n=10) GobEncode 18.43m ± 3% 17.62m ± 0% -4.38% (p=0.000 n=10) Gzip 405.1m ± 0% 405.4m ± 0% +0.06% (p=0.035 n=10) Gunzip 86.84m ± 0% 87.20m ± 0% +0.41% (p=0.000 n=10) HTTPClientServer 88.47µ ± 0% 86.92µ ± 1% -1.75% (p=0.000 n=10) JSONEncode 18.84m ± 0% 18.66m ± 0% -0.95% (p=0.000 n=10) JSONDecode 79.35m ± 0% 75.77m ± 1% -4.51% (p=0.000 n=10) Mandelbrot200 7.215m ± 0% 7.215m ± 0% ~ (p=0.315 n=10) GoParse 7.591m ± 1% 7.407m ± 1% -2.43% (p=0.000 n=10) RegexpMatchEasy0_32 133.8n ± 0% 134.3n ± 0% +0.37% (p=0.000 n=10) RegexpMatchEasy0_1K 1.540µ ± 0% 1.544µ ± 0% +0.26% (p=0.000 n=10) RegexpMatchEasy1_32 164.1n ± 0% 165.4n ± 0% +0.79% (p=0.000 n=10) RegexpMatchEasy1_1K 1.626µ ± 0% 1.629µ ± 0% +0.18% (p=0.000 n=10) RegexpMatchMedium_32 1.403µ ± 0% 1.413µ ± 0% +0.71% (p=0.000 n=10) RegexpMatchMedium_1K 41.22µ ± 0% 41.59µ ± 0% +0.90% (p=0.000 n=10) RegexpMatchHard_32 2.071µ ± 0% 2.060µ ± 0% -0.53% (p=0.000 n=10) RegexpMatchHard_1K 61.05µ ± 0% 61.30µ ± 0% +0.41% (p=0.001 n=10) Revcomp 1.351 ± 0% 1.357 ± 0% +0.42% (p=0.000 n=10) Template 117.3m ± 1% 110.6m ± 2% -5.71% (p=0.000 n=10) TimeParse 411.9n ± 0% 411.7n ± 0% ~ (p=0.117 n=10) TimeFormat 514.2n ± 0% 499.9n ± 0% -2.77% (p=0.000 n=10) geomean 104.2µ 103.0µ -1.15% │ CL 479496 v8 │ this CL │ │ B/s │ B/s vs base │ GobDecode 46.84Mi ± 3% 48.24Mi ± 1% +2.98% (p=0.001 n=10) GobEncode 39.72Mi ± 4% 41.53Mi ± 0% +4.57% (p=0.000 n=10) Gzip 45.68Mi ± 0% 45.65Mi ± 0% -0.05% (p=0.029 n=10) Gunzip 213.1Mi ± 0% 212.2Mi ± 0% -0.41% (p=0.000 n=10) JSONEncode 98.23Mi ± 0% 99.18Mi ± 0% +0.97% (p=0.000 n=10) JSONDecode 23.32Mi ± 0% 24.42Mi ± 1% +4.72% (p=0.000 n=10) GoParse 7.277Mi ± 1% 7.458Mi ± 1% +2.49% (p=0.000 n=10) RegexpMatchEasy0_32 228.1Mi ± 0% 227.3Mi ± 0% -0.36% (p=0.000 n=10) RegexpMatchEasy0_1K 634.2Mi ± 0% 632.5Mi ± 0% -0.27% (p=0.000 n=10) RegexpMatchEasy1_32 186.0Mi ± 0% 184.5Mi ± 0% -0.79% (p=0.000 n=10) 
RegexpMatchEasy1_1K 600.4Mi ± 0% 599.4Mi ± 0% -0.17% (p=0.000 n=10) RegexpMatchMedium_32 21.75Mi ± 0% 21.60Mi ± 0% -0.70% (p=0.000 n=10) RegexpMatchMedium_1K 23.69Mi ± 0% 23.48Mi ± 0% -0.89% (p=0.000 n=10) RegexpMatchHard_32 14.73Mi ± 0% 14.81Mi ± 0% +0.52% (p=0.000 n=10) RegexpMatchHard_1K 15.99Mi ± 0% 15.93Mi ± 0% -0.42% (p=0.000 n=10) Revcomp 179.4Mi ± 0% 178.6Mi ± 0% -0.42% (p=0.000 n=10) Template 15.78Mi ± 1% 16.73Mi ± 2% +6.04% (p=0.000 n=10) geomean 59.97Mi 60.58Mi +1.02% The change should be a net win, as all it does is pattern-match Ctz ops and replace them with the respective native instructions, so any performance regression is likely also micro-architecture related, as observed in CL 479496's results. (Indeed, some of the more drastic improvements may well also be coincidental, but the point is that there is at least a small amount of deterministic improvement anyway.) Updates golang#59120 Change-Id: I6c90f727eb00e0add2a5f8575ac045b9e288af54
xen0n · Apr 11, 2023 · 5109221 · 5109221
1 parent d14a9a4
commit 5109221
Show file tree

Hide file tree

Showing 10 changed files with 2,665 additions and 379 deletions.
diff --git a/src/cmd/compile/internal/loong64/ssa.go b/src/cmd/compile/internal/loong64/ssa.go
@@ -324,6 +324,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		ssa.OpLOONG64MOVDF,
 		ssa.OpLOONG64NEGF,
 		ssa.OpLOONG64NEGD,
+		ssa.OpLOONG64CTZW,
+		ssa.OpLOONG64CTZV,
 		ssa.OpLOONG64SQRTD,
 		ssa.OpLOONG64SQRTF:
 		p := s.Prog(v.Op.Asm())

diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules
@@ -129,6 +129,9 @@
 
 (Com(64|32|16|8) x) => (NOR (MOVVconst [0]) x)
 
+(Ctz(32|64)NonZero ...) => (Ctz(32|64) ...)
+(Ctz(32|64) ...) => (CTZ(W|V) ...)
+
 (Sqrt ...) => (SQRTD ...)
 (Sqrt32 ...) => (SQRTF ...)
 

diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
@@ -192,6 +192,8 @@ func init() {
 		{name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"},   // -arg0, float64
 		{name: "SQRTD", argLength: 1, reg: fp11, asm: "SQRTD"}, // sqrt(arg0), float64
 		{name: "SQRTF", argLength: 1, reg: fp11, asm: "SQRTF"}, // sqrt(arg0), float32
+		{name: "CTZW", argLength: 1, reg: gp11, asm: "CTZW"},   // Count trailing (low order) zeroes (returns 0-32)
+		{name: "CTZV", argLength: 1, reg: gp11, asm: "CTZV"},   // Count trailing (low order) zeroes (returns 0-64)
 
 		{name: "MASKEQZ", argLength: 2, reg: gp21, asm: "MASKEQZ"}, // returns 0 if arg1 == 0, otherwise returns arg0
 		{name: "MASKNEZ", argLength: 2, reg: gp21, asm: "MASKNEZ"}, // returns 0 if arg1 != 0, otherwise returns arg0

diff --git a/src/cmd/compile/internal/ssa/_gen/generic.rules b/src/cmd/compile/internal/ssa/_gen/generic.rules
@@ -2703,3 +2703,40 @@
 (RotateLeft(64|32|16|8) (RotateLeft(64|32|16|8) x c) d) && c.Type.Size() == 4 && d.Type.Size() == 4 => (RotateLeft(64|32|16|8) x (Add32 <c.Type> c d))
 (RotateLeft(64|32|16|8) (RotateLeft(64|32|16|8) x c) d) && c.Type.Size() == 2 && d.Type.Size() == 2 => (RotateLeft(64|32|16|8) x (Add16 <c.Type> c d))
 (RotateLeft(64|32|16|8) (RotateLeft(64|32|16|8) x c) d) && c.Type.Size() == 1 && d.Type.Size() == 1 => (RotateLeft(64|32|16|8) x (Add8  <c.Type> c d))
+
+// Ctz simplifications. x is NN bits wide whatever the width of the comparison on the CtzNN result, so the rewritten test uses the NN-bit Eq/Neq and a zero of x's type.
+// CtzNN(x) == NN => x == 0
+(Eq(64|32|16|8) (Const(64|32|16|8) [64]) (Ctz64 x)) && shouldStrengthReduceCtz(config) && config.PtrSize == 8 => (Eq64 (Const64 <x.Type> [0]) x)
+(Eq(64|32|16|8) (Const(64|32|16|8) [32]) (Ctz32 x)) && shouldStrengthReduceCtz(config) => (Eq32 (Const32 <x.Type> [0]) x)
+(Eq(64|32|16|8) (Const(64|32|16|8) [16]) (Ctz16 x)) && shouldStrengthReduceCtz(config) => (Eq16 (Const16 <x.Type> [0]) x)
+(Eq(64|32|16|8) (Const(64|32|16|8) [8])  (Ctz8  x)) && shouldStrengthReduceCtz(config) => (Eq8  (Const8  <x.Type> [0]) x)
+
+// CtzNN(x) != NN => x != 0
+(Neq(64|32|16|8) (Const(64|32|16|8) [64]) (Ctz64 x)) && shouldStrengthReduceCtz(config) && config.PtrSize == 8 => (Neq64 (Const64 <x.Type> [0]) x)
+(Neq(64|32|16|8) (Const(64|32|16|8) [32]) (Ctz32 x)) && shouldStrengthReduceCtz(config) => (Neq32 (Const32 <x.Type> [0]) x)
+(Neq(64|32|16|8) (Const(64|32|16|8) [16]) (Ctz16 x)) && shouldStrengthReduceCtz(config) => (Neq16 (Const16 <x.Type> [0]) x)
+(Neq(64|32|16|8) (Const(64|32|16|8) [8])  (Ctz8  x)) && shouldStrengthReduceCtz(config) => (Neq8  (Const8  <x.Type> [0]) x)
+
+// CtzNN(x) < NN => x != 0
+(Less(64|32|16|8) (Ctz64 x) (Const(64|32|16|8) [64])) && shouldStrengthReduceCtz(config) && config.PtrSize == 8 => (Neq64 (Const64 <x.Type> [0]) x)
+(Less(64|32|16|8) (Ctz32 x) (Const(64|32|16|8) [32])) && shouldStrengthReduceCtz(config) => (Neq32 (Const32 <x.Type> [0]) x)
+(Less(64|32|16|8) (Ctz16 x) (Const(64|32|16|8) [16])) && shouldStrengthReduceCtz(config) => (Neq16 (Const16 <x.Type> [0]) x)
+(Less(64|32|16|8) (Ctz8  x) (Const(64|32|16|8) [8]))  && shouldStrengthReduceCtz(config) => (Neq8  (Const8  <x.Type> [0]) x)
+
+// CtzNN(x) >= NN => x == 0
+(Leq(64|32|16|8)  (Const(64|32|16|8) [64]) (Ctz64 x)) && shouldStrengthReduceCtz(config) && config.PtrSize == 8 => (Eq64 (Const64 <x.Type> [0]) x)
+(Leq(64|32|16|8)  (Const(64|32|16|8) [32]) (Ctz32 x)) && shouldStrengthReduceCtz(config) => (Eq32 (Const32 <x.Type> [0]) x)
+(Leq(64|32|16|8)  (Const(64|32|16|8) [16]) (Ctz16 x)) && shouldStrengthReduceCtz(config) => (Eq16 (Const16 <x.Type> [0]) x)
+(Leq(64|32|16|8)  (Const(64|32|16|8) [8])  (Ctz8  x)) && shouldStrengthReduceCtz(config) => (Eq8  (Const8  <x.Type> [0]) x)
+
+// CtzNN(x) <= NN - 1 => x != 0
+(Leq(64|32|16|8) (Ctz64 x) (Const(64|32|16|8) [63])) && shouldStrengthReduceCtz(config) && config.PtrSize == 8 => (Neq64 (Const64 <x.Type> [0]) x)
+(Leq(64|32|16|8) (Ctz32 x) (Const(64|32|16|8) [31])) && shouldStrengthReduceCtz(config) => (Neq32 (Const32 <x.Type> [0]) x)
+(Leq(64|32|16|8) (Ctz16 x) (Const(64|32|16|8) [15])) && shouldStrengthReduceCtz(config) => (Neq16 (Const16 <x.Type> [0]) x)
+(Leq(64|32|16|8) (Ctz8  x) (Const(64|32|16|8) [7]))  && shouldStrengthReduceCtz(config) => (Neq8  (Const8  <x.Type> [0]) x)
+
+// CtzNN(x) > NN - 1 => x == 0
+(Less(64|32|16|8) (Const(64|32|16|8) [63]) (Ctz64 x)) && shouldStrengthReduceCtz(config) && config.PtrSize == 8 => (Eq64 (Const64 <x.Type> [0]) x)
+(Less(64|32|16|8) (Const(64|32|16|8) [31]) (Ctz32 x)) && shouldStrengthReduceCtz(config) => (Eq32 (Const32 <x.Type> [0]) x)
+(Less(64|32|16|8) (Const(64|32|16|8) [15]) (Ctz16 x)) && shouldStrengthReduceCtz(config) => (Eq16 (Const16 <x.Type> [0]) x)
+(Less(64|32|16|8) (Const(64|32|16|8) [7])  (Ctz8  x)) && shouldStrengthReduceCtz(config) => (Eq8  (Const8  <x.Type> [0]) x)
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go
@@ -2063,3 +2063,14 @@ func isARM64addcon(v int64) bool {
 	}
 	return v <= 0xFFF
 }
+
+// shouldStrengthReduceCtz reports whether the Ctz comparison rewrites in
+// generic.rules are profitable for c's architecture; so far only loong64 is.
+func shouldStrengthReduceCtz(c *Config) bool {
+	switch c.arch {
+	case "loong64":
+		return true
+	default: // every other architecture keeps the rewrites disabled
+		return false
+	}
+}
diff --git a/src/cmd/compile/internal/ssa/rewriteLOONG64.go b/src/cmd/compile/internal/ssa/rewriteLOONG64.go