chore: update otel example

grab · Nov 9, 2022 · dd858ea · dd858ea
1 parent 1278a80
commit dd858ea
Showing 8 changed files with 164 additions and 22 deletions.
diff --git a/example/otel/README.md b/example/otel/README.md
@@ -40,9 +40,25 @@ UPTRACE_DSN=http://project2_secret_token@localhost:14317/2 go run client.go
 trace: http://localhost:14318/traces/ee029d8782242c8ed38b16d961093b35
 ```
 
+![Redis trace](./image/redis-trace.png)
+
 You can also open Uptrace UI at [http://localhost:14318](http://localhost:14318) to view available
 spans, logs, and metrics.
 
+## Redis monitoring
+
+You can also [monitor Redis performance](https://uptrace.dev/opentelemetry/redis-monitoring.html)
+metrics by installing OpenTelemetry Collector.
+
+[OpenTelemetry Collector](https://uptrace.dev/opentelemetry/collector.html) is an agent that pulls
+telemetry data from systems you want to monitor and sends it to APM tools using the OpenTelemetry
+protocol (OTLP).
+
+When telemetry data reaches Uptrace, it automatically generates a Redis dashboard from a pre-defined
+template.
+
+![Redis dashboard](./image/metrics.png)
+
 ## Links
 
 - [Uptrace open-source APM](https://uptrace.dev/get/open-source-apm.html)

diff --git a/example/otel/config/alertmanager.yml b/example/otel/config/alertmanager.yml
@@ -0,0 +1,53 @@
+# See https://prometheus.io/docs/alerting/latest/configuration/ for details.
+
+global:
+  # The smarthost and SMTP sender used for mail notifications.
+  smtp_smarthost: 'mailhog:1025'
+  smtp_from: 'alertmanager@example.com'
+  smtp_require_tls: false
+
+receivers:
+  - name: 'team-X'
+    email_configs:
+      - to: 'some-receiver@example.com'
+        send_resolved: true
+
+# The root route on which each incoming alert enters.
+route:
+  # The labels by which incoming alerts are grouped together. For example,
+  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+  # be batched into a single group.
+  group_by: ['alertname', 'cluster', 'service']
+
+  # When a new group of alerts is created by an incoming alert, wait at
+  # least 'group_wait' to send the initial notification.
+  # This ensures that multiple alerts for the same group that start
+  # firing shortly after one another are batched together in the first
+  # notification.
+  group_wait: 30s
+
+  # When the first notification was sent, wait 'group_interval' to send a batch
+  # of new alerts that started firing for that group.
+  group_interval: 5m
+
+  # If an alert has successfully been sent, wait 'repeat_interval' to
+  # resend it.
+  repeat_interval: 3h
+
+  # A default receiver
+  receiver: team-X
+
+  # All the above attributes are inherited by all child routes and can
+  # be overwritten on each.
+
+  # The child route trees.
+  routes:
+    # This route matches error alerts created from spans or logs.
+    - matchers:
+        - alert_kind="error"
+      group_interval: 24h
+      receiver: team-X
+
+# The directory from which notification templates are read.
+templates:
+  - '/etc/alertmanager/template/*.tmpl'
diff --git a/example/otel/otel-collector.yaml → example/otel/config/otel-collector.yaml b/example/otel/otel-collector.yaml → example/otel/config/otel-collector.yaml
diff --git a/example/otel/vector.toml → example/otel/config/vector.toml b/example/otel/vector.toml → example/otel/config/vector.toml
diff --git a/example/otel/docker-compose.yml b/example/otel/docker-compose.yml
@@ -18,7 +18,7 @@ services:
       - '9000:9000'
 
   uptrace:
-    image: 'uptrace/uptrace:1.1.0'
+    image: 'uptrace/uptrace:1.2.0'
     #image: 'uptrace/uptrace-dev:latest'
     restart: on-failure
     volumes:
@@ -36,19 +36,34 @@ services:
   otel-collector:
     image: otel/opentelemetry-collector-contrib:0.58.0
     restart: on-failure
-    user: '0:0' # required for logs
     volumes:
-      - ./otel-collector.yaml:/etc/otelcol-contrib/config.yaml
-      - /var/lib/docker/containers:/var/lib/docker/containers:ro
-      - /var/log:/var/log:ro
+      - ./config/otel-collector.yaml:/etc/otelcol-contrib/config.yaml
     ports:
       - '4317:4317'
       - '4318:4318'
 
   vector:
     image: timberio/vector:0.24.X-alpine
     volumes:
-      - ./vector.toml:/etc/vector/vector.toml:ro
+      - ./config/vector.toml:/etc/vector/vector.toml:ro
+
+  alertmanager:
+    image: prom/alertmanager:v0.24.0
+    restart: on-failure
+    volumes:
+      - ./config/alertmanager.yml:/etc/alertmanager/config.yml
+      - alertmanager_data:/alertmanager
+    ports:
+      - 9093:9093
+    command:
+      - '--config.file=/etc/alertmanager/config.yml'
+      - '--storage.path=/alertmanager'
+
+  mailhog:
+    image: mailhog/mailhog:v1.0.1
+    restart: on-failure
+    ports:
+      - '8025:8025'
 
   redis-server:
     image: redis

diff --git a/example/otel/image/metrics.png b/example/otel/image/metrics.png
diff --git a/example/otel/image/redis-trace.png b/example/otel/image/redis-trace.png
diff --git a/example/otel/uptrace.yml b/example/otel/uptrace.yml
@@ -13,6 +13,16 @@
 ##   foo: $$FOO_BAR
 ##
 
+##
+## ClickHouse database credentials.
+##
+ch:
+  # Connection string for ClickHouse database. For example:
+  # clickhouse://<user>:<password>@<host>:<port>/<database>?sslmode=disable
+  #
+  # See https://clickhouse.uptrace.dev/guide/golang-clickhouse.html#options
+  dsn: 'clickhouse://default:@clickhouse:9000/uptrace?sslmode=disable'
+
 ##
 ## A list of pre-configured projects. Each project is fully isolated.
 ##
@@ -26,6 +36,10 @@ projects:
       - service.name
       - host.name
       - deployment.environment
+    # Group spans by deployment.environment attribute.
+    group_by_env: false
+    # Group funcs spans by service.name attribute.
+    group_funcs_by_service: false
 
   # Other projects can be used to monitor your applications.
   # To monitor micro-services or multiple related services, use a single project.
@@ -36,6 +50,49 @@ projects:
       - service.name
       - host.name
       - deployment.environment
+    # Group spans by deployment.environment attribute.
+    group_by_env: false
+    # Group funcs spans by service.name attribute.
+    group_funcs_by_service: false
+
+##
+## Create metrics from spans and events.
+##
+metrics_from_spans:
+  - name: uptrace.tracing.spans_duration
+    description: Spans duration (excluding events)
+    instrument: histogram
+    unit: microseconds
+    value: span.duration / 1000
+    attrs:
+      - span.system as system
+      - service.name as service
+      - host.name as host
+      - span.status_code as status
+    where: not span.is_event
+
+  - name: uptrace.tracing.spans
+    description: Spans count (excluding events)
+    instrument: counter
+    unit: 1
+    value: span.count
+    attrs:
+      - span.system as system
+      - service.name as service
+      - host.name as host
+      - span.status_code as status
+    where: not span.is_event
+
+  - name: uptrace.tracing.events
+    description: Events count (excluding spans)
+    instrument: counter
+    unit: 1
+    value: span.count
+    attrs:
+      - span.system as system
+      - service.name as service
+      - host.name as host
+    where: span.is_event
 
 ##
 ## To require authentication, uncomment the following section.
@@ -78,16 +135,6 @@ auth:
   #     # Defaults to 'preferred_username'.
   #     claim: preferred_username
 
-##
-## ClickHouse database credentials.
-##
-ch:
-  # Connection string for ClickHouse database. For example:
-  # clickhouse://<user>:<password>@<host>:<port>/<database>?sslmode=disable
-  #
-  # See https://clickhouse.uptrace.dev/guide/golang-clickhouse.html#options
-  dsn: 'clickhouse://default:@clickhouse:9000/uptrace?sslmode=disable'
-
 ##
 ## Alerting rules for monitoring metrics.
 ##
@@ -102,8 +149,8 @@ alerting:
         - $net_errors > 0 group by host.name
       # for the last 5 minutes
       for: 5m
-      # in the project id=1
-      projects: [1]
+      annotations:
+        summary: '{{ $labels.host_name }} has high number of net errors: {{ $values.net_errors }}'
 
     - name: Filesystem usage >= 90%
       metrics:
@@ -114,15 +161,26 @@ alerting:
         - where device !~ "loop"
         - $fs_usage{state="used"} / $fs_usage >= 0.9
       for: 5m
-      projects: [1]
+      annotations:
+        summary: '{{ $labels.host_name }} has high FS usage: {{ $values.fs_usage }}'
 
     - name: Uptrace is dropping spans
       metrics:
         - uptrace.projects.spans as $spans
       query:
         - $spans{type=dropped} > 0
       for: 1m
-      projects: [1]
+      annotations:
+        summary: 'Uptrace has dropped {{ $values.spans }} spans'
+
+    - name: Always firing (for fun and testing)
+      metrics:
+        - process.runtime.go.goroutines as $goroutines
+      query:
+        - $goroutines >= 0 group by host.name
+      for: 1m
+      annotations:
+        summary: '{{ $labels.host_name }} has high number of goroutines: {{ $values.goroutines }}'
 
   # Create alerts from error logs and span events.
   create_alerts_from_spans:
@@ -139,8 +197,8 @@ alerting:
 ##
 alertmanager_client:
   # AlertManager API endpoints that Uptrace uses to manage alerts.
-  # urls:
-  #   - 'http://alertmanager:9093/api/v2/alerts'
+  urls:
+    - 'http://alertmanager:9093/api/v2/alerts'
 
 ##
 ## Various options to tweak ClickHouse schema.