Skip to content

Commit 54b48f7

Browse files
authored
balancer/weightedroundrobin: Add recording point for endpoint weight not yet usable and add metrics tests (#7466)
1 parent 7b9e012 commit 54b48f7

25 files changed

+691
-80
lines changed

balancer/weightedroundrobin/balancer.go

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -415,7 +415,7 @@ func (p *picker) inc() uint32 {
415415
}
416416

417417
func (p *picker) regenerateScheduler() {
418-
s := p.newScheduler()
418+
s := p.newScheduler(true)
419419
atomic.StorePointer(&p.scheduler, unsafe.Pointer(&s))
420420
}
421421

@@ -558,14 +558,17 @@ func (w *weightedSubConn) updateConnectivityState(cs connectivity.State) connect
558558
w.SubConn.Connect()
559559
case connectivity.Ready:
560560
// If we transition back to READY state, reset nonEmptySince so that we
561-
// apply the blackout period after we start receiving load data. Note
562-
// that we cannot guarantee that we will never receive lingering
563-
// callbacks for backend metric reports from the previous connection
564-
// after the new connection has been established, but they should be
565-
// masked by new backend metric reports from the new connection by the
566-
// time the blackout period ends.
561+
// apply the blackout period after we start receiving load data. Also
562+
// reset lastUpdated so that the endpoint weight is reported as not yet
563+
// usable if the endpoint is asked for its weight before receiving a new
564+
// load report. Note that we cannot guarantee that we will never receive
565+
// lingering callbacks for backend metric reports from the previous
566+
// connection after the new connection has been established, but they
567+
// should be masked by new backend metric reports from the new
568+
// connection by the time the blackout period ends.
567569
w.mu.Lock()
568570
w.nonEmptySince = time.Time{}
571+
w.lastUpdated = time.Time{}
569572
w.mu.Unlock()
570573
case connectivity.Shutdown:
571574
if w.stopORCAListener != nil {
@@ -592,7 +595,7 @@ func (w *weightedSubConn) updateConnectivityState(cs connectivity.State) connect
592595
// account the parameters. Returns 0 for blacked out or expired data, which
593596
// will cause the backend weight to be treated as the mean of the weights of the
594597
// other backends. If recordMetrics is set to true, this function will emit
595-
// metrics through the mtrics registry.
598+
// metrics through the metrics registry.
596599
func (w *weightedSubConn) weight(now time.Time, weightExpirationPeriod, blackoutPeriod time.Duration, recordMetrics bool) (weight float64) {
597600
w.mu.Lock()
598601
defer w.mu.Unlock()
@@ -603,6 +606,13 @@ func (w *weightedSubConn) weight(now time.Time, weightExpirationPeriod, blackout
603606
}()
604607
}
605608

609+
// The SubConn has not received a load report (i.e. just turned READY with
610+
// no load report).
611+
if w.lastUpdated == (time.Time{}) {
612+
endpointWeightNotYetUsableMetric.Record(w.metricsRecorder, 1, w.target, w.locality)
613+
return 0
614+
}
615+
606616
// If the most recent update was longer ago than the expiration period,
607617
// reset nonEmptySince so that we apply the blackout period again if we
608618
// start getting data again in the future, and return 0.

balancer/weightedroundrobin/balancer_test.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import (
3232
"google.golang.org/grpc/internal/grpctest"
3333
"google.golang.org/grpc/internal/stubserver"
3434
"google.golang.org/grpc/internal/testutils/roundrobin"
35+
"google.golang.org/grpc/internal/testutils/stats"
3536
"google.golang.org/grpc/orca"
3637
"google.golang.org/grpc/peer"
3738
"google.golang.org/grpc/resolver"
@@ -81,6 +82,14 @@ var (
8182
WeightUpdatePeriod: stringp(".050s"),
8283
ErrorUtilizationPenalty: float64p(0),
8384
}
85+
testMetricsConfig = iwrr.LBConfig{
86+
EnableOOBLoadReport: boolp(false),
87+
OOBReportingPeriod: stringp("0.005s"),
88+
BlackoutPeriod: stringp("0s"),
89+
WeightExpirationPeriod: stringp("60s"),
90+
WeightUpdatePeriod: stringp(".050s"),
91+
ErrorUtilizationPenalty: float64p(0),
92+
}
8493
)
8594

8695
type testServer struct {
@@ -196,6 +205,43 @@ func (s) TestBalancer_OneAddress(t *testing.T) {
196205
}
197206
}
198207

208+
// TestWRRMetricsBasic tests metrics emitted from the WRR balancer. It
209+
// configures a weighted round robin balancer as the top level balancer of a
210+
// ClientConn, and configures a fake stats handler on the ClientConn to receive
211+
// metrics. It verifies stats emitted from the Weighted Round Robin Balancer on
212+
// balancer startup case which triggers the first picker and scheduler update
213+
// before any load reports are received.
214+
//
215+
// Note that in this test and others, metrics emission assertions are a snapshot
216+
// of the most recently emitted metrics. This is due to the nondeterminism of
217+
// scheduler updates with respect to test bodies, so the assertions made are
218+
// from the most recently synced state of the system (picker/scheduler) from the
219+
// test body.
220+
func (s) TestWRRMetricsBasic(t *testing.T) {
221+
ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
222+
defer cancel()
223+
224+
srv := startServer(t, reportCall)
225+
sc := svcConfig(t, testMetricsConfig)
226+
227+
mr := stats.NewTestMetricsRecorder(t)
228+
if err := srv.StartClient(grpc.WithDefaultServiceConfig(sc), grpc.WithStatsHandler(mr)); err != nil {
229+
t.Fatalf("Error starting client: %v", err)
230+
}
231+
srv.callMetrics.SetQPS(float64(1))
232+
233+
if _, err := srv.Client.EmptyCall(ctx, &testpb.Empty{}); err != nil {
234+
t.Fatalf("Error from EmptyCall: %v", err)
235+
}
236+
237+
mr.AssertDataForMetric("grpc.lb.wrr.rr_fallback", 1) // Falls back because only one SubConn.
238+
mr.AssertDataForMetric("grpc.lb.wrr.endpoint_weight_stale", 0) // The endpoint weight has not expired so this is 0 (never emitted).
239+
mr.AssertDataForMetric("grpc.lb.wrr.endpoint_weight_not_yet_usable", 1)
240+
// Unusable, so no endpoint weight. Due to only one SubConn, this will never
241+
// update the weight. Thus, this will stay 0.
242+
mr.AssertDataForMetric("grpc.lb.wrr.endpoint_weights", 0)
243+
}
244+
199245
// Tests two addresses with ORCA reporting disabled (should fall back to pure
200246
// RR).
201247
func (s) TestBalancer_TwoAddresses_ReportingDisabled(t *testing.T) {
Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
/*
2+
*
3+
* Copyright 2024 gRPC authors.
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*
17+
*/
18+
19+
package weightedroundrobin
20+
21+
import (
22+
"testing"
23+
"time"
24+
25+
"google.golang.org/grpc/internal/grpctest"
26+
iserviceconfig "google.golang.org/grpc/internal/serviceconfig"
27+
"google.golang.org/grpc/internal/testutils/stats"
28+
)
29+
30+
type s struct {
31+
grpctest.Tester
32+
}
33+
34+
func Test(t *testing.T) {
35+
grpctest.RunSubTests(t, s{})
36+
}
37+
38+
// TestWRR_Metrics_SubConnWeight tests different scenarios for the weight call
39+
// on a weighted SubConn, and expects certain metrics for each of these
40+
// scenarios.
41+
func (s) TestWRR_Metrics_SubConnWeight(t *testing.T) {
42+
tests := []struct {
43+
name string
44+
weightExpirationPeriod time.Duration
45+
blackoutPeriod time.Duration
46+
lastUpdated time.Time
47+
nonEmpty time.Time
48+
nowTime time.Time
49+
endpointWeightStaleWant float64
50+
endpointWeightNotYetUsableWant float64
51+
endpointWeightWant float64
52+
}{
53+
// The weighted SubConn's lastUpdated field hasn't been set, so this
54+
// SubConn's weight is not yet usable. Thus, should emit that endpoint
55+
// weight is not yet usable, and 0 for weight.
56+
{
57+
name: "no weight set",
58+
weightExpirationPeriod: time.Second,
59+
blackoutPeriod: time.Second,
60+
nowTime: time.Now(),
61+
endpointWeightStaleWant: 0,
62+
endpointWeightNotYetUsableWant: 1,
63+
endpointWeightWant: 0,
64+
},
65+
{
66+
name: "weight expiration",
67+
lastUpdated: time.Now(),
68+
weightExpirationPeriod: 2 * time.Second,
69+
blackoutPeriod: time.Second,
70+
nowTime: time.Now().Add(100 * time.Second),
71+
endpointWeightStaleWant: 1,
72+
endpointWeightNotYetUsableWant: 0,
73+
endpointWeightWant: 0,
74+
},
75+
{
76+
name: "in blackout period",
77+
lastUpdated: time.Now(),
78+
weightExpirationPeriod: time.Minute,
79+
blackoutPeriod: 10 * time.Second,
80+
nowTime: time.Now(),
81+
endpointWeightStaleWant: 0,
82+
endpointWeightNotYetUsableWant: 1,
83+
endpointWeightWant: 0,
84+
},
85+
{
86+
name: "normal weight",
87+
lastUpdated: time.Now(),
88+
nonEmpty: time.Now(),
89+
weightExpirationPeriod: time.Minute,
90+
blackoutPeriod: time.Second,
91+
nowTime: time.Now().Add(10 * time.Second),
92+
endpointWeightStaleWant: 0,
93+
endpointWeightNotYetUsableWant: 0,
94+
endpointWeightWant: 3,
95+
},
96+
{
97+
name: "weight expiration takes precdedence over blackout",
98+
lastUpdated: time.Now(),
99+
nonEmpty: time.Now(),
100+
weightExpirationPeriod: time.Second,
101+
blackoutPeriod: time.Minute,
102+
nowTime: time.Now().Add(10 * time.Second),
103+
endpointWeightStaleWant: 1,
104+
endpointWeightNotYetUsableWant: 0,
105+
endpointWeightWant: 0,
106+
},
107+
}
108+
109+
for _, test := range tests {
110+
t.Run(test.name, func(t *testing.T) {
111+
tmr := stats.NewTestMetricsRecorder(t)
112+
wsc := &weightedSubConn{
113+
metricsRecorder: tmr,
114+
weightVal: 3,
115+
lastUpdated: test.lastUpdated,
116+
nonEmptySince: test.nonEmpty,
117+
}
118+
wsc.weight(test.nowTime, test.weightExpirationPeriod, test.blackoutPeriod, true)
119+
120+
tmr.AssertDataForMetric("grpc.lb.wrr.endpoint_weight_stale", test.endpointWeightStaleWant)
121+
tmr.AssertDataForMetric("grpc.lb.wrr.endpoint_weight_not_yet_usable", test.endpointWeightNotYetUsableWant)
122+
tmr.AssertDataForMetric("grpc.lb.wrr.endpoint_weights", test.endpointWeightWant)
123+
})
124+
}
125+
126+
}
127+
128+
// TestWRR_Metrics_Scheduler_RR_Fallback tests the round robin fallback metric
129+
// for scheduler updates. It tests the case with one SubConn, and two SubConns
130+
// with no weights. Both of these should emit a count metric for round robin
131+
// fallback.
132+
func (s) TestWRR_Metrics_Scheduler_RR_Fallback(t *testing.T) {
133+
tmr := stats.NewTestMetricsRecorder(t)
134+
wsc := &weightedSubConn{
135+
metricsRecorder: tmr,
136+
weightVal: 0,
137+
}
138+
139+
p := &picker{
140+
cfg: &lbConfig{
141+
BlackoutPeriod: iserviceconfig.Duration(10 * time.Second),
142+
WeightExpirationPeriod: iserviceconfig.Duration(3 * time.Minute),
143+
},
144+
subConns: []*weightedSubConn{wsc},
145+
metricsRecorder: tmr,
146+
}
147+
// There is only one SubConn, so no matter if the SubConn has a weight or
148+
// not, it will fall back to round robin.
149+
p.regenerateScheduler()
150+
tmr.AssertDataForMetric("grpc.lb.wrr.rr_fallback", 1)
151+
tmr.ClearMetrics()
152+
153+
// With two SubConns, if neither of them has weights, it will also fall back
154+
// to round robin.
155+
wsc2 := &weightedSubConn{
156+
target: "target",
157+
metricsRecorder: tmr,
158+
weightVal: 0,
159+
}
160+
p.subConns = append(p.subConns, wsc2)
161+
p.regenerateScheduler()
162+
tmr.AssertDataForMetric("grpc.lb.wrr.rr_fallback", 1)
163+
}

balancer/weightedroundrobin/scheduler.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,14 +31,16 @@ type scheduler interface {
3131
// len(scWeights)-1 are zero or there is only a single subconn, otherwise it
3232
// will return an Earliest Deadline First (EDF) scheduler implementation that
3333
// selects the subchannels according to their weights.
34-
func (p *picker) newScheduler() scheduler {
35-
scWeights := p.scWeights(true)
34+
func (p *picker) newScheduler(recordMetrics bool) scheduler {
35+
scWeights := p.scWeights(recordMetrics)
3636
n := len(scWeights)
3737
if n == 0 {
3838
return nil
3939
}
4040
if n == 1 {
41-
rrFallbackMetric.Record(p.metricsRecorder, 1, p.target, p.locality)
41+
if recordMetrics {
42+
rrFallbackMetric.Record(p.metricsRecorder, 1, p.target, p.locality)
43+
}
4244
return &rrScheduler{numSCs: 1, inc: p.inc}
4345
}
4446
sum := float64(0)
@@ -55,7 +57,9 @@ func (p *picker) newScheduler() scheduler {
5557
}
5658

5759
if numZero >= n-1 {
58-
rrFallbackMetric.Record(p.metricsRecorder, 1, p.target, p.locality)
60+
if recordMetrics {
61+
rrFallbackMetric.Record(p.metricsRecorder, 1, p.target, p.locality)
62+
}
5963
return &rrScheduler{numSCs: uint32(n), inc: p.inc}
6064
}
6165
unscaledMean := sum / float64(n-numZero)

internal/stats/metrics_recorder_list_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,8 +144,8 @@ func (s) TestMetricsRecorderList(t *testing.T) {
144144

145145
// Create two stats.Handlers which also implement MetricsRecorder, configure
146146
// one as a global dial option and one as a local dial option.
147-
mr1 := stats.NewTestMetricsRecorder(t, []string{})
148-
mr2 := stats.NewTestMetricsRecorder(t, []string{})
147+
mr1 := stats.NewTestMetricsRecorder(t)
148+
mr2 := stats.NewTestMetricsRecorder(t)
149149

150150
defer internal.ClearGlobalDialOptions()
151151
internal.AddGlobalDialOptions.(func(opt ...grpc.DialOption))(grpc.WithStatsHandler(mr1))

0 commit comments

Comments
 (0)