Skip to content

Commit 2dfbe1f

Browse files
committed
Distributor profiling (cortexproject#296)
* Update weaveworks/common * Tag images as quay.io/weaveworks, to make dev/test easier. Also, update circle.yml for new image names * Refactor the distributor HTTP code, add more tracing * Optimise Cortex Protobufs - Move Prometheus remote protos into cortex so we can own them - Compile with gogoprotobuf, to get generated marshal functions - Remove pointers from protos - Remove intermediate (prometheus) representation in distributor write path - Add vendored gogoproto code - Treat label names and values as custom byte arrays, to prevent a copy
1 parent f2f455c commit 2dfbe1f

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

93 files changed

+28978
-300
lines changed

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
# Boilerplate for building Docker containers.
55
# All this must go at top of file I'm afraid.
6-
IMAGE_PREFIX := weaveworks/cortex-
6+
IMAGE_PREFIX := quay.io/weaveworks/cortex-
77
IMAGE_TAG := $(shell ./tools/image-tag)
88
UPTODATE := .uptodate
99

@@ -78,7 +78,7 @@ $(EXES): build/$(UPTODATE)
7878
$(NETGO_CHECK)
7979

8080
%.pb.go: build/$(UPTODATE)
81-
protoc -I ./vendor:./$(@D) --go_out=plugins=grpc:./$(@D) ./$(patsubst %.pb.go,%.proto,$@)
81+
protoc -I ./vendor:./$(@D) --gogoslick_out=plugins=grpc:./$(@D) ./$(patsubst %.pb.go,%.proto,$@)
8282

8383
lint: build/$(UPTODATE)
8484
./tools/lint -notestpackage -ignorespelling queriers -ignorespelling Queriers .

build/Dockerfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ RUN go get -tags netgo \
1212
github.com/client9/misspell/cmd/misspell \
1313
github.com/jteeuwen/go-bindata/go-bindata \
1414
github.com/golang/protobuf/protoc-gen-go \
15+
github.com/gogo/protobuf/protoc-gen-gogoslick \
16+
github.com/gogo/protobuf/gogoproto \
1517
gopkg.in/mvdan/sh.v1/cmd/shfmt && \
1618
rm -rf /go/pkg /go/src
1719
RUN mkdir protoc && \

circle.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ dependencies:
1010
override:
1111
- |
1212
cd build && \
13-
../tools/rebuild-image weaveworks/cortex-build . build.sh Dockerfile && \
13+
../tools/rebuild-image quay.io/weaveworks/cortex-build . build.sh Dockerfile && \
1414
touch .uptodate
1515
1616
test:

cmd/ingester/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ func main() {
4949

5050
// Deferring a func to make ordering obvious
5151
defer func() {
52-
registration.ChangeState(ring.IngesterState_LEAVING)
52+
registration.ChangeState(ring.LEAVING)
5353
ingester.Stop()
5454
registration.Unregister()
5555
server.Stop()

cortex.proto

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,24 @@
11
syntax = "proto3";
22

3-
import "github.com/prometheus/prometheus/storage/remote/remote.proto";
4-
53
package cortex;
64

5+
import "github.com/gogo/protobuf/gogoproto/gogo.proto";
6+
7+
option (gogoproto.marshaler_all) = true;
8+
option (gogoproto.unmarshaler_all) = true;
9+
710
service Ingester {
8-
rpc Push(remote.WriteRequest) returns (WriteResponse) {};
11+
rpc Push(WriteRequest) returns (WriteResponse) {};
912
rpc Query(QueryRequest) returns (QueryResponse) {};
1013
rpc LabelValues(LabelValuesRequest) returns (LabelValuesResponse) {};
1114
rpc UserStats(UserStatsRequest) returns (UserStatsResponse) {};
1215
rpc MetricsForLabelMatchers(MetricsForLabelMatchersRequest) returns (MetricsForLabelMatchersResponse) {};
1316
}
1417

18+
message WriteRequest {
19+
repeated TimeSeries timeseries = 1 [(gogoproto.nullable) = false];
20+
}
21+
1522
message WriteResponse {}
1623

1724
message QueryRequest {
@@ -21,7 +28,7 @@ message QueryRequest {
2128
}
2229

2330
message QueryResponse {
24-
repeated remote.TimeSeries timeseries = 1;
31+
repeated TimeSeries timeseries = 1 [(gogoproto.nullable) = false];
2532
}
2633

2734
message LabelValuesRequest {
@@ -49,12 +56,29 @@ message MetricsForLabelMatchersResponse {
4956
repeated Metric metric = 1;
5057
}
5158

59+
60+
message TimeSeries {
61+
repeated LabelPair labels = 1 [(gogoproto.nullable) = false];
62+
// Sorted by time, oldest sample first.
63+
repeated Sample samples = 2 [(gogoproto.nullable) = false];
64+
}
65+
66+
message LabelPair {
67+
bytes name = 1 [(gogoproto.customtype) = "github.com/weaveworks/cortex/util/wire.Bytes", (gogoproto.nullable) = false];
68+
bytes value = 2 [(gogoproto.customtype) = "github.com/weaveworks/cortex/util/wire.Bytes", (gogoproto.nullable) = false];
69+
}
70+
71+
message Sample {
72+
double value = 1;
73+
int64 timestamp_ms = 2;
74+
}
75+
5276
message LabelMatchers {
5377
repeated LabelMatcher matchers = 1;
5478
}
5579

5680
message Metric {
57-
repeated remote.LabelPair labels = 1;
81+
repeated LabelPair labels = 1 [(gogoproto.nullable) = false];
5882
}
5983

6084
enum MatchType {

distributor/distributor.go

Lines changed: 58 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ import (
2020
"github.com/prometheus/common/log"
2121
"github.com/prometheus/common/model"
2222
"github.com/prometheus/prometheus/storage/metric"
23-
"github.com/prometheus/prometheus/storage/remote"
2423

2524
"github.com/weaveworks/common/instrument"
2625
"github.com/weaveworks/common/middleware"
@@ -38,6 +37,7 @@ var (
3837
"The current number of ingester clients.",
3938
nil, nil,
4039
)
40+
labelNameBytes = []byte(model.MetricNameLabel)
4141
)
4242

4343
// Distributor is a storage.SampleAppender and a cortex.Querier which
@@ -242,20 +242,25 @@ func (d *Distributor) getClientFor(ingester *ring.IngesterDesc) (cortex.Ingester
242242
return client, nil
243243
}
244244

245-
func tokenForMetric(userID string, metric model.Metric) uint32 {
246-
name := metric[model.MetricNameLabel]
247-
return tokenFor(userID, name)
245+
func tokenForLabels(userID string, labels []cortex.LabelPair) (uint32, error) {
246+
for _, label := range labels {
247+
if label.Name.Equal(labelNameBytes) {
248+
return tokenFor(userID, label.Value), nil
249+
}
250+
}
251+
return 0, fmt.Errorf("No metric name label")
248252
}
249253

250-
func tokenFor(userID string, name model.LabelValue) uint32 {
254+
func tokenFor(userID string, name []byte) uint32 {
251255
h := fnv.New32()
252256
h.Write([]byte(userID))
253-
h.Write([]byte(name))
257+
h.Write(name)
254258
return h.Sum32()
255259
}
256260

257261
type sampleTracker struct {
258-
sample *model.Sample
262+
labels []cortex.LabelPair
263+
sample cortex.Sample
259264
minSuccess int
260265
maxFailures int
261266
succeeded int32
@@ -270,13 +275,30 @@ type pushTracker struct {
270275
}
271276

272277
// Push implements cortex.IngesterServer
273-
func (d *Distributor) Push(ctx context.Context, req *remote.WriteRequest) (*cortex.WriteResponse, error) {
278+
func (d *Distributor) Push(ctx context.Context, req *cortex.WriteRequest) (*cortex.WriteResponse, error) {
274279
userID, err := user.GetID(ctx)
275280
if err != nil {
276281
return nil, err
277282
}
278283

279-
samples := util.FromWriteRequest(req)
284+
// First we flatten out the request into a list of samples.
285+
// We use the heuristic of 1 sample per TS to size the array.
286+
// We also work out the hash value at the same time.
287+
samples := make([]sampleTracker, 0, len(req.Timeseries))
288+
keys := make([]uint32, 0, len(req.Timeseries))
289+
for _, ts := range req.Timeseries {
290+
key, err := tokenForLabels(userID, ts.Labels)
291+
if err != nil {
292+
return nil, err
293+
}
294+
for _, s := range ts.Samples {
295+
keys = append(keys, key)
296+
samples = append(samples, sampleTracker{
297+
labels: ts.Labels,
298+
sample: s,
299+
})
300+
}
301+
}
280302
d.receivedSamples.Add(float64(len(samples)))
281303

282304
if len(samples) == 0 {
@@ -288,27 +310,24 @@ func (d *Distributor) Push(ctx context.Context, req *remote.WriteRequest) (*cort
288310
return nil, errIngestionRateLimitExceeded
289311
}
290312

291-
keys := make([]uint32, len(samples), len(samples))
292-
for i, sample := range samples {
293-
keys[i] = tokenForMetric(userID, sample.Metric)
294-
}
295-
296-
ingesters, err := d.ring.BatchGet(keys, d.cfg.ReplicationFactor, ring.Write)
297-
if err != nil {
313+
var ingesters [][]*ring.IngesterDesc
314+
if err := instrument.TimeRequestHistogram(ctx, "Distributor.Push[ring-lookup]", nil, func(ctx context.Context) error {
315+
var err error
316+
ingesters, err = d.ring.BatchGet(keys, d.cfg.ReplicationFactor, ring.Write)
317+
if err != nil {
318+
return err
319+
}
320+
return nil
321+
}); err != nil {
298322
return nil, err
299323
}
300324

301-
sampleTrackers := make([]sampleTracker, len(samples), len(samples))
302325
samplesByIngester := map[*ring.IngesterDesc][]*sampleTracker{}
303326
for i := range samples {
304327
// We need a response from a quorum of ingesters, which is n/2 + 1.
305328
minSuccess := (len(ingesters[i]) / 2) + 1
306-
307-
sampleTrackers[i] = sampleTracker{
308-
sample: samples[i],
309-
minSuccess: minSuccess,
310-
maxFailures: len(ingesters[i]) - minSuccess,
311-
}
329+
samples[i].minSuccess = minSuccess
330+
samples[i].maxFailures = len(ingesters[i]) - minSuccess
312331

313332
// Skip those that have not heartbeated in a while. NB these are still
314333
// included in the calculation of minSuccess, so if too many failed ingesters
@@ -322,14 +341,14 @@ func (d *Distributor) Push(ctx context.Context, req *remote.WriteRequest) (*cort
322341

323342
// This is just a shortcut - if there are not minSuccess available ingesters,
324343
// after filtering out dead ones, don't even bother trying.
325-
if len(liveIngesters) < sampleTrackers[i].minSuccess {
344+
if len(liveIngesters) < minSuccess {
326345
return nil, fmt.Errorf("wanted at least %d live ingesters to process write, had %d",
327-
sampleTrackers[i].minSuccess, len(liveIngesters))
346+
minSuccess, len(liveIngesters))
328347
}
329348

330349
for _, liveIngester := range liveIngesters {
331350
sampleForIngester := samplesByIngester[liveIngester]
332-
samplesByIngester[liveIngester] = append(sampleForIngester, &sampleTrackers[i])
351+
samplesByIngester[liveIngester] = append(sampleForIngester, &samples[i])
333352
}
334353
}
335354

@@ -395,17 +414,24 @@ func (d *Distributor) sendSamples(ctx context.Context, ingester *ring.IngesterDe
395414
}
396415
}
397416

398-
func (d *Distributor) sendSamplesErr(ctx context.Context, ingester *ring.IngesterDesc, sampleTrackers []*sampleTracker) error {
417+
func (d *Distributor) sendSamplesErr(ctx context.Context, ingester *ring.IngesterDesc, samples []*sampleTracker) error {
399418
client, err := d.getClientFor(ingester)
400419
if err != nil {
401420
return err
402421
}
403-
samples := make([]*model.Sample, len(sampleTrackers), len(sampleTrackers))
404-
for i := range sampleTrackers {
405-
samples[i] = sampleTrackers[i].sample
422+
423+
req := &cortex.WriteRequest{
424+
Timeseries: make([]cortex.TimeSeries, 0, len(samples)),
406425
}
426+
for _, s := range samples {
427+
req.Timeseries = append(req.Timeseries, cortex.TimeSeries{
428+
Labels: s.labels,
429+
Samples: []cortex.Sample{s.sample},
430+
})
431+
}
432+
407433
err = instrument.TimeRequestHistogram(ctx, "Distributor.sendSamples", d.sendDuration, func(ctx context.Context) error {
408-
_, err := client.Push(ctx, util.ToWriteRequest(samples))
434+
_, err := client.Push(ctx, req)
409435
return err
410436
})
411437
d.ingesterAppends.WithLabelValues(ingester.Addr).Inc()
@@ -434,7 +460,7 @@ func (d *Distributor) Query(ctx context.Context, from, to model.Time, matchers .
434460
return err
435461
}
436462

437-
ingesters, err := d.ring.Get(tokenFor(userID, metricName), d.cfg.ReplicationFactor, ring.Read)
463+
ingesters, err := d.ring.Get(tokenFor(userID, []byte(metricName)), d.cfg.ReplicationFactor, ring.Read)
438464
if err != nil {
439465
return err
440466
}

distributor/distributor_test.go

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ import (
88
"github.com/prometheus/client_golang/prometheus"
99
"github.com/prometheus/common/model"
1010
"github.com/prometheus/prometheus/storage/metric"
11-
"github.com/prometheus/prometheus/storage/remote"
1211
"github.com/stretchr/testify/assert"
1312
"golang.org/x/net/context"
1413
"google.golang.org/grpc"
@@ -44,7 +43,7 @@ type mockIngester struct {
4443
happy bool
4544
}
4645

47-
func (i mockIngester) Push(ctx context.Context, in *remote.WriteRequest, opts ...grpc.CallOption) (*cortex.WriteResponse, error) {
46+
func (i mockIngester) Push(ctx context.Context, in *cortex.WriteRequest, opts ...grpc.CallOption) (*cortex.WriteResponse, error) {
4847
if !i.happy {
4948
return nil, fmt.Errorf("Fail")
5049
}
@@ -56,15 +55,15 @@ func (i mockIngester) Query(ctx context.Context, in *cortex.QueryRequest, opts .
5655
return nil, fmt.Errorf("Fail")
5756
}
5857
return &cortex.QueryResponse{
59-
Timeseries: []*remote.TimeSeries{
58+
Timeseries: []cortex.TimeSeries{
6059
{
61-
Labels: []*remote.LabelPair{
60+
Labels: []cortex.LabelPair{
6261
{
63-
Name: "__name__",
64-
Value: "foo",
62+
Name: []byte("__name__"),
63+
Value: []byte("foo"),
6564
},
6665
},
67-
Samples: []*remote.Sample{
66+
Samples: []cortex.Sample{
6867
{
6968
Value: 0,
7069
TimestampMs: 0,
@@ -169,16 +168,16 @@ func TestDistributorPush(t *testing.T) {
169168
}
170169
defer d.Stop()
171170

172-
request := &remote.WriteRequest{}
171+
request := &cortex.WriteRequest{}
173172
for i := 0; i < tc.samples; i++ {
174-
ts := &remote.TimeSeries{
175-
Labels: []*remote.LabelPair{
176-
{"__name__", "foo"},
177-
{"bar", "baz"},
178-
{"sample", fmt.Sprintf("%d", i)},
173+
ts := cortex.TimeSeries{
174+
Labels: []cortex.LabelPair{
175+
{[]byte("__name__"), []byte("foo")},
176+
{[]byte("bar"), []byte("baz")},
177+
{[]byte("sample"), []byte(fmt.Sprintf("%d", i))},
179178
},
180179
}
181-
ts.Samples = []*remote.Sample{
180+
ts.Samples = []cortex.Sample{
182181
{
183182
Value: float64(i),
184183
TimestampMs: int64(i),

0 commit comments

Comments
 (0)