Skip to content

Commit

Permalink
Add Zipkin input plugin (#3080)
Browse files Browse the repository at this point in the history
  • Loading branch information
danielnelson authored Aug 3, 2017
1 parent ce12913 commit 137b312
Show file tree
Hide file tree
Showing 18 changed files with 2,469 additions and 0 deletions.
7 changes: 7 additions & 0 deletions Godeps
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ github.com/Shopify/sarama c01858abb625b73a3af51d0798e4ad42c8147093
github.com/Sirupsen/logrus 61e43dc76f7ee59a82bdf3d71033dc12bea4c77d
github.com/aerospike/aerospike-client-go 95e1ad7791bdbca44707fedbb29be42024900d9c
github.com/amir/raidman c74861fe6a7bb8ede0a010ce4485bdbb4fc4c985
github.com/apache/thrift 4aaa92ece8503a6da9bc6701604f69acf2b99d07
github.com/aws/aws-sdk-go c861d27d0304a79f727e9a8a4e2ac1e74602fdc0
github.com/beorn7/perks 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9
github.com/bsm/sarama-cluster ccdc0803695fbce22f1706d04ded46cd518fd832
Expand All @@ -17,10 +18,13 @@ github.com/eapache/go-resiliency b86b1ec0dd4209a588dc1285cdd471e73525c0b3
github.com/eapache/go-xerial-snappy bb955e01b9346ac19dc29eb16586c90ded99a98c
github.com/eapache/queue 44cc805cf13205b55f69e14bcb69867d1ae92f98
github.com/eclipse/paho.mqtt.golang d4f545eb108a2d19f9b1a735689dbfb719bc21fb
github.com/go-logfmt/logfmt 390ab7935ee28ec6b286364bba9b4dd6410cb3d5
github.com/go-sql-driver/mysql 2e00b5cd70399450106cec6431c2e2ce3cae5034
github.com/gobwas/glob bea32b9cd2d6f55753d94a28e959b13f0244797a
github.com/gogo/protobuf 7b6c6391c4ff245962047fc1e2c6e08b1cdfa0e8
github.com/golang/protobuf 8ee79997227bf9b34611aee7946ae64735e6fd93
github.com/golang/snappy 7db9049039a047d955fe8c19b83c8ff5abd765c7
github.com/google/go-cmp f94e52cad91c65a63acc1e75d4be223ea22e99bc
github.com/gorilla/mux 392c28fe23e1c45ddba891b0320b3b5df220beea
github.com/hailocab/go-hostpool e80d13ce29ede4452c43dea11e79b9bc8a15b478
github.com/hashicorp/consul 63d2fc68239b996096a1c55a0d4b400ea4c2583f
Expand All @@ -39,6 +43,9 @@ github.com/nats-io/nats ea9585611a4ab58a205b9b125ebd74c389a6b898
github.com/nats-io/nuid 289cccf02c178dc782430d534e3c1f5b72af807f
github.com/nsqio/go-nsq a53d495e81424aaf7a7665a9d32a97715c40e953
github.com/opencontainers/runc 89ab7f2ccc1e45ddf6485eaa802c35dcf321dfc8
github.com/opentracing-contrib/go-observer a52f2342449246d5bcc273e65cbdcfa5f7d6c63c
github.com/opentracing/opentracing-go 06f47b42c792fef2796e9681353e1d908c417827
github.com/openzipkin/zipkin-go-opentracing 1cafbdfde94fbf2b373534764e0863aa3bd0bf7b
github.com/pierrec/lz4 5c9560bfa9ace2bf86080bf40d46b34ae44604df
github.com/pierrec/xxHash 5a004441f897722c627870a981d02b29924215fa
github.com/pkg/errors 645ef00459ed84a119197bfb8d8205042c6df63d
Expand Down
5 changes: 5 additions & 0 deletions docs/LICENSE_OF_DEPENDENCIES.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,11 @@ works:
- github.com/eclipse/paho.mqtt.golang [ECLIPSE](https://github.com/eclipse/paho.mqtt.golang/blob/master/LICENSE)
- github.com/fsouza/go-dockerclient [BSD](https://github.com/fsouza/go-dockerclient/blob/master/LICENSE)
- github.com/gobwas/glob [MIT](https://github.com/gobwas/glob/blob/master/LICENSE)
- github.com/google/go-cmp [BSD](https://github.com/google/go-cmp/blob/master/LICENSE)
- github.com/gogo/protobuf [BSD](https://github.com/gogo/protobuf/blob/master/LICENSE)
- github.com/golang/protobuf [BSD](https://github.com/golang/protobuf/blob/master/LICENSE)
- github.com/golang/snappy [BSD](https://github.com/golang/snappy/blob/master/LICENSE)
- github.com/go-logfmt/logfmt [MIT](https://github.com/go-logfmt/logfmt/blob/master/LICENSE)
- github.com/gorilla/mux [BSD](https://github.com/gorilla/mux/blob/master/LICENSE)
- github.com/go-sql-driver/mysql [MPL](https://github.com/go-sql-driver/mysql/blob/master/LICENSE)
- github.com/hailocab/go-hostpool [MIT](https://github.com/hailocab/go-hostpool/blob/master/LICENSE)
Expand All @@ -52,6 +54,9 @@ works:
- github.com/nats-io/nats [MIT](https://github.com/nats-io/nats/blob/master/LICENSE)
- github.com/nats-io/nuid [MIT](https://github.com/nats-io/nuid/blob/master/LICENSE)
- github.com/nsqio/go-nsq [MIT](https://github.com/nsqio/go-nsq/blob/master/LICENSE)
- github.com/opentracing-contrib/go-observer [APACHE](https://github.com/opentracing-contrib/go-observer/blob/master/LICENSE)
- github.com/opentracing/opentracing-go [MIT](https://github.com/opentracing/opentracing-go/blob/master/LICENSE)
- github.com/openzipkin/zipkin-go-opentracing [MIT](https://github.com/openzipkin/zipkin-go-opentracing/blob/master/LICENSE)
- github.com/pierrec/lz4 [BSD](https://github.com/pierrec/lz4/blob/master/LICENSE)
- github.com/pierrec/xxHash [BSD](https://github.com/pierrec/xxHash/blob/master/LICENSE)
- github.com/pkg/errors [BSD](https://github.com/pkg/errors/blob/master/LICENSE)
Expand Down
1 change: 1 addition & 0 deletions plugins/inputs/all/all.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,5 +89,6 @@ import (
_ "github.com/influxdata/telegraf/plugins/inputs/webhooks"
_ "github.com/influxdata/telegraf/plugins/inputs/win_perf_counters"
_ "github.com/influxdata/telegraf/plugins/inputs/zfs"
_ "github.com/influxdata/telegraf/plugins/inputs/zipkin"
_ "github.com/influxdata/telegraf/plugins/inputs/zookeeper"
)
164 changes: 164 additions & 0 deletions plugins/inputs/zipkin/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# Zipkin Plugin

This plugin implements the Zipkin http server to gather trace and timing data needed to troubleshoot latency problems in microservice architectures.

*Please note: this plugin is experimental; its data schema may be subject to change
based on its main use cases and the evolution of the OpenTracing standard.*

## Configuration:
```toml
[[inputs.zipkin]]
path = "/api/v1/spans" # URL path for span data
port = 9411 # Port on which Telegraf listens
```

## Tracing:

This plugin uses Annotations tags and fields to track data from spans

- __TRACE:__ is a set of spans that share a single root span.
Traces are built by collecting all Spans that share a traceId.

- __SPAN:__ is a set of Annotations and BinaryAnnotations that correspond to a particular RPC.

- __Annotations:__ for each annotation & binary annotation of a span a metric is output. *Records an occurrence in time at the beginning and end of a request.*

Annotations may have the following values:

- __CS (client start):__ beginning of span, request is made.
- __SR (server receive):__ server receives the request and starts processing it;
network latency & clock jitter account for the difference from cs
- __SS (server send):__ server is done processing and sends the response back to the client;
the time it took to process the request accounts for the difference from sr
- __CR (client receive):__ end of span, client receives response from server
RPC is considered complete with this annotation

### Tags
* __"id":__ The 64 bit ID of the span.
* __"parent_id":__ An ID associated with a particular child span. If there is no child span, the parent ID is set to ID.
* __"trace_id":__ The 64 or 128-bit ID of a particular trace. Every span in a trace shares this ID. Concatenation of high and low and converted to hexadecimal.
* __"name":__ Defines a span

##### Annotations have these additional tags:

* __"service_name":__ Defines a service
* __"annotation":__ The value of an annotation
* __"endpoint_host":__ Listening port concat with IPV4, if port is not present it will not be concatenated

##### Binary Annotations have these additional tags:

* __"service_name":__ Defines a service
* __"annotation":__ The value of an annotation
* __"endpoint_host":__ Listening port concat with IPV4, if port is not present it will not be concatenated
* __"annotation_key":__ label describing the annotation


### Fields:
* __"duration_ns":__ The time in nanoseconds between the end and beginning of a span.



### Sample Queries:

__Get All Span Names for Service__ `my_web_server`
```sql
SHOW TAG VALUES FROM "zipkin" with key="name" WHERE "service_name" = 'my_web_server'
```
- __Description:__ returns a list containing the names of the spans which have annotations with the given `service_name` of `my_web_server`.

__Get All Service Names__
```sql
SHOW TAG VALUES FROM "zipkin" WITH KEY = "service_name"
```
- __Description:__ returns a list of all `distinct` endpoint service names.

__Find spans with longest duration__
```sql
SELECT max("duration_ns") FROM "zipkin" WHERE "service_name" = 'my_service' AND "name" = 'my_span_name' AND time > now() - 20m GROUP BY "trace_id",time(30s) LIMIT 5
```
- __Description:__ In the last 20 minutes find the top 5 longest span durations for service `my_service` and span name `my_span_name`


### Recommended InfluxDB setup

This plugin will create high cardinality data, so we recommend using the [TSI InfluxDB engine](https://www.influxdata.com/path-1-billion-time-series-influxdb-high-cardinality-indexing-ready-testing/).
#### How To Set Up InfluxDB For Work With Zipkin

##### Steps
1. ___Update___ InfluxDB to >= 1.3, in order to use the new tsi engine.

2. ___Generate___ a config file with the following command:
```sh
influxd config > /path/for/config/file
```
3. ___Add___ the following to your config file, under the `[data]` section:
```toml
[data]
index-version = "tsi1"
```

4. ___Start___ `influxd` with your new config file:
```sh
influxd -config=/path/to/your/config/file
```

5. ___Update___ your retention policy:
```sql
ALTER RETENTION POLICY "autogen" ON "telegraf" DURATION 1d SHARD DURATION 30m
```

### Example Input Trace:

- [Cli microservice with two services Test](https://github.com/openzipkin/zipkin-go-opentracing/tree/master/examples/cli_with_2_services)
- [Test data from distributed trace repo sample json](https://github.com/mattkanwisher/distributedtrace/blob/master/testclient/sample.json)
#### [Trace Example from Zipkin model](http://zipkin.io/pages/data_model.html)
```json
{
"traceId": "bd7a977555f6b982",
"name": "query",
"id": "be2d01e33cc78d97",
"parentId": "ebf33e1a81dc6f71",
"timestamp": 1458702548786000,
"duration": 13000,
"annotations": [
{
"endpoint": {
"serviceName": "zipkin-query",
"ipv4": "192.168.1.2",
"port": 9411
},
"timestamp": 1458702548786000,
"value": "cs"
},
{
"endpoint": {
"serviceName": "zipkin-query",
"ipv4": "192.168.1.2",
"port": 9411
},
"timestamp": 1458702548799000,
"value": "cr"
}
],
"binaryAnnotations": [
{
"key": "jdbc.query",
"value": "select distinct `zipkin_spans`.`trace_id` from `zipkin_spans` join `zipkin_annotations` on (`zipkin_spans`.`trace_id` = `zipkin_annotations`.`trace_id` and `zipkin_spans`.`id` = `zipkin_annotations`.`span_id`) where (`zipkin_annotations`.`endpoint_service_name` = ? and `zipkin_spans`.`start_ts` between ? and ?) order by `zipkin_spans`.`start_ts` desc limit ?",
"endpoint": {
"serviceName": "zipkin-query",
"ipv4": "192.168.1.2",
"port": 9411
}
},
{
"key": "sa",
"value": true,
"endpoint": {
"serviceName": "spanstore-jdbc",
"ipv4": "127.0.0.1",
"port": 3306
}
}
]
}
```
75 changes: 75 additions & 0 deletions plugins/inputs/zipkin/cmd/stress_test_write/stress_test_write.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
This is a development testing cli tool meant to stress the zipkin telegraf plugin.
It writes a specified number of zipkin spans to the plugin endpoint, with other
parameters which dictate batch size and flush timeout.
Usage as follows:
`./stress_test_write -batch_size=<batch_size> -max_backlog=<max_span_buffer_backlog> -batch_interval=<batch_interval_in_seconds> -span_count=<number_of_spans_to_write> -zipkin_host=<zipkin_service_hostname>`
Or with a timer:
`time ./stress_test_write -batch_size=<batch_size> -max_backlog=<max_span_buffer_backlog> -batch_interval=<batch_interval_in_seconds> -span_count=<number_of_spans_to_write> -zipkin_host=<zipkin_service_hostname>`
However, the flag defaults work just fine for a good write stress test (and are what
this tool has mainly been tested with), so there shouldn't be much need to
manually tweak the parameters.
*/

package main

import (
"flag"
"fmt"
"log"
"time"

zipkin "github.com/openzipkin/zipkin-go-opentracing"
)

// Command-line settings for the stress test. Each variable is bound to a flag
// in init and read in main after flag.Parse.
var (
// BatchSize is set by -batch_size and passed to zipkin.HTTPBatchSize.
BatchSize int
// MaxBackLog is set by -max_backlog and passed to zipkin.HTTPMaxBacklog.
MaxBackLog int
// BatchTimeInterval is set by -batch_interval; main multiplies it by
// time.Second before passing it to zipkin.HTTPBatchInterval.
BatchTimeInterval int
// SpanCount is set by -span_count and is the number of spans main writes.
SpanCount int
// ZipkinServerHost is set by -zipkin_host; main uses it as the host part of
// the http://<host>:9411/api/v1/spans collector URL.
ZipkinServerHost string
)

// usage is the help text attached to every flag registered in init. It shows
// the full command line, with each flag written as -name=<value>.
const usage = `./stress_test_write -batch_size=<batch_size> -max_backlog=<max_span_buffer_backlog> -batch_interval=<batch_interval_in_seconds> -span_count=<number_of_spans_to_write> -zipkin_host=<zipkin_service_hostname>`

// init binds the package-level settings to their command-line flags. Every
// flag shares the same usage string as its help text.
func init() {
	// Integer-valued flags: destination, flag name, default value.
	intFlags := []struct {
		target   *int
		name     string
		fallback int
	}{
		{&BatchSize, "batch_size", 10000},
		{&MaxBackLog, "max_backlog", 100000},
		{&BatchTimeInterval, "batch_interval", 1},
		{&SpanCount, "span_count", 100000},
	}
	for _, f := range intFlags {
		flag.IntVar(f.target, f.name, f.fallback, usage)
	}

	flag.StringVar(&ZipkinServerHost, "zipkin_host", "localhost", usage)
}

// main parses the command-line flags, creates a Zipkin HTTP collector that
// posts to http://<zipkin_host>:9411/api/v1/spans, and writes SpanCount
// trivial spans through a tracer backed by that collector. The collector is
// closed on return.
func main() {
	flag.Parse()
	var hostname = fmt.Sprintf("http://%s:9411/api/v1/spans", ZipkinServerHost)
	collector, err := zipkin.NewHTTPCollector(
		hostname,
		zipkin.HTTPBatchSize(BatchSize),
		zipkin.HTTPMaxBacklog(MaxBackLog),
		zipkin.HTTPBatchInterval(time.Duration(BatchTimeInterval)*time.Second))
	if err != nil {
		log.Fatalf("Error intializing zipkin http collector: %v\n", err)
	}
	// Register the cleanup only once the collector is known to be valid.
	defer collector.Close()

	tracer, err := zipkin.NewTracer(
		zipkin.NewRecorder(collector, false, "127.0.0.1:0", "trivial"))
	if err != nil {
		// log.Fatalf exits without running deferred calls, so close the
		// collector explicitly before bailing out.
		collector.Close()
		log.Fatalf("Error: %v\n", err)
	}

	log.Printf("Writing %d spans to zipkin server at %s\n", SpanCount, hostname)
	for i := 0; i < SpanCount; i++ {
		parent := tracer.StartSpan("Parent")
		parent.LogEvent(fmt.Sprintf("Trace%d", i))
		parent.Finish()
	}
	log.Println("Done. Flushing remaining spans...")
}
Loading

0 comments on commit 137b312

Please sign in to comment.