Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#FROM telegraf:1.0-alpine
FROM telegraf:1.0

# Reset to root user to do some installs
USER root

# Install packages (kept in alphabetical order).
# net-tools (netstat) and procps (pkill, free, top) are listed explicitly
# because bin/reload.sh and bin/sensor.sh call those tools at runtime and
# they are not guaranteed to be present in the base image.
# ca-certificates is listed explicitly because recommended packages are
# skipped and the curl downloads below use HTTPS.
# The apt caches are removed in the same layer so they never reach the image.
RUN apt-get update && apt-get -y install --no-install-recommends \
        bash \
        ca-certificates \
        curl \
        net-tools \
        netcat-openbsd \
        procps \
        unzip \
    && rm -rf /var/cache/apt/* /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Add ContainerPilot and its configuration
# Releases at https://github.com/joyent/containerpilot/releases
# CONTAINERPILOT is the location of the ContainerPilot config file; both
# variables stay in the runtime environment, exactly as before.
ENV CONTAINERPILOT_VER=2.3.0 \
    CONTAINERPILOT=file:///etc/containerpilot.json

# Download the release tarball, verify it against the pinned SHA-1 and unpack
# the binary into /usr/local/bin. The checksum line uses the canonical
# "<hash><two spaces><file>" layout so that both GNU and BusyBox sha1sum
# accept it.
RUN export CONTAINERPILOT_CHECKSUM=ec9dbedaca9f4a7a50762f50768cbc42879c7208 \
    && curl --retry 7 --fail -Lso /tmp/containerpilot.tar.gz \
         "https://github.com/joyent/containerpilot/releases/download/${CONTAINERPILOT_VER}/containerpilot-${CONTAINERPILOT_VER}.tar.gz" \
    && echo "${CONTAINERPILOT_CHECKSUM}  /tmp/containerpilot.tar.gz" | sha1sum -c \
    && tar zxf /tmp/containerpilot.tar.gz -C /usr/local/bin \
    && rm /tmp/containerpilot.tar.gz

# Add our helper/glue scripts (bin/) and the configuration (etc/) for this
# specific app
COPY bin/ /usr/local/bin/
COPY etc/ /etc/

# Install Consul
# Releases at https://releases.hashicorp.com/consul
# Download the pinned release quietly (following redirects, like the other
# downloads in this file), verify its SHA-256, and unpack the binary into
# /usr/local/bin. The archive is named explicitly instead of relying on
# unzip's implicit ".zip" suffix lookup. The checksum line uses the canonical
# "<hash><two spaces><file>" layout accepted by both GNU and BusyBox sha256sum.
# NOTE(review): /config is created here but its consumer is not visible in
# this file - confirm it is still needed.
RUN export CONSUL_VERSION=0.6.4 \
    && export CONSUL_CHECKSUM=abdf0e1856292468e2c9971420d73b805e93888e006c76324ae39416edcf0627 \
    && curl --retry 7 --fail -Lso /tmp/consul.zip "https://releases.hashicorp.com/consul/${CONSUL_VERSION}/consul_${CONSUL_VERSION}_linux_amd64.zip" \
    && echo "${CONSUL_CHECKSUM}  /tmp/consul.zip" | sha256sum -c \
    && unzip /tmp/consul.zip -d /usr/local/bin \
    && rm /tmp/consul.zip \
    && mkdir /config

# Create the (empty) Consul config and data directories, owned by root
RUN mkdir -p /etc/consul /var/lib/consul \
    && chown -R root /etc/consul /var/lib/consul

# Install Consul template
# Releases at https://releases.hashicorp.com/consul-template/
# Download the pinned release, verify its SHA-256 and unpack the binary into
# /usr/local/bin. The checksum line uses the canonical
# "<hash><two spaces><file>" layout accepted by both GNU and BusyBox sha256sum.
RUN export CONSUL_TEMPLATE_VERSION=0.14.0 \
    && export CONSUL_TEMPLATE_CHECKSUM=7c70ea5f230a70c809333e75fdcff2f6f1e838f29cfb872e1420a63cdf7f3a78 \
    && curl --retry 7 --fail -Lso /tmp/consul-template.zip "https://releases.hashicorp.com/consul-template/${CONSUL_TEMPLATE_VERSION}/consul-template_${CONSUL_TEMPLATE_VERSION}_linux_amd64.zip" \
    && echo "${CONSUL_TEMPLATE_CHECKSUM}  /tmp/consul-template.zip" | sha256sum -c \
    && unzip /tmp/consul-template.zip -d /usr/local/bin \
    && rm /tmp/consul-template.zip

# Clear the entrypoint inherited from the base image so that CMD below is
# executed as-is
ENTRYPOINT []

# Run telegraf under ContainerPilot, as root
USER root
CMD ["/usr/local/bin/containerpilot", "/entrypoint.sh", "telegraf", "-config", "/etc/telegraf.conf"]
92 changes: 92 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,94 @@
# telegraf
Work in progress, not stable, expect force pushes of this repo

Containerized Telegraf server, based on the official `telegraf:1.0` Docker image, adding [ContainerPilot](https://www.joyent.com/containerpilot) to announce this container's Telegraf service to a service discovery layer, such as Consul or etcd.

### Usage
Include this image in your Docker Compose project, query Consul for its IP address and use it in your configurations, easily done via [Consul-Template](https://github.com/hashicorp/consul-template). The default ContainerPilot configuration talks to Consul and assumes the IP address to access Consul is passed to the container in an environment variable, $CONSUL (or via docker link consul).

Configuration of telegraf is managed via ContainerPilot `preStart` or `onChange` handlers.

Telegraf output is configured with the InfluxDB output plugin. By default Telegraf looks for an InfluxDB container started in the same cluster, but it's possible to point Telegraf to a remote InfluxDB server by uncommenting and setting the INFLUXDB_HOST variable in the _env.telegraf file.

Telegraf input sources are configured with the Prometheus input plugin and are a list of URLs pointing to ContainerPilot telemetry endpoints (http://container-ip:9090/metrics). Input sources are reloaded automatically by the `onChange` event handler.

### Configuration

Please run setup.sh to generate required _env file and configure CONSUL env variable.

You can also check _env.telegraf file. By setting INFLUXDB_HOST variable there you can point telegraf to already running instance of InfluxDB (you have to remove influxdb section from docker-compose.yml in this case).

### Hello world example

1. [Get a Joyent account](https://my.joyent.com/landing/signup/) and [add your SSH key](https://docs.joyent.com/public-cloud/getting-started).
1. Install the [Docker Toolbox](https://docs.docker.com/installation/mac/) (including `docker` and `docker-compose`) on your laptop or other environment, as well as the [Joyent Triton CLI](https://www.joyent.com/blog/introducing-the-triton-command-line-tool) (`triton` replaces our old `sdc-*` CLI tools).
1. [Configure Docker and Docker Compose for use with Joyent.](https://docs.joyent.com/public-cloud/api-access/docker)

Check that everything is configured correctly by running `./setup.sh`. This will check that your environment is setup correctly and will create an `_env` file that includes injecting an environment variable for the Consul hostname into the Telegraf and Nginx containers so we can take advantage of [Triton Container Name Service (CNS)](https://www.joyent.com/blog/introducing-triton-container-name-service).

Start everything:

```bash
docker-compose build
docker-compose up -d
```
As a result we'll have 4 containers running:
- consul
- telegraf_nginx_1 - nginx web-server is used just for demo purposes to scale and provide telemetry
- influxdb - currently running locally, but it's possible to connect with existing influxdb server
- telegraf

To verify telegraf container status you can check container log (there should be a list of records, which indicate attempts to join new input source):
```bash
docker logs telegraf 2>&1 | grep EventMemberJoin
```
it should display a list of members(input sources) recently added.

You can also check the list of input source URLs for telemetry currently used by Telegraf with the following command:
```bash
docker exec -i -t telegraf /bin/grep :9090 /etc/telegraf.conf
```
The list of URLs includes the consul container (first one in the output) and the telegraf container (localhost); all other URLs are nginx container URLs.
So you can count the URLs in the output, subtract 2, and the result is the number of nginx containers.


Let's scale up the number of nginx containers to 3, wait for 15 seconds (to give Telegraf some time to reconfigure itself) and check the number of input URLs (or EventMemberJoin events in the logs):
```bash
docker-compose scale nginx=3
sleep 15

# check source urls
docker exec -i -t telegraf /bin/grep :9090 /etc/telegraf.conf

# check logs
docker logs telegraf 2>&1 | grep EventMemberJoin
```

Let's scale down the number of nginx containers to 1, wait for 15 seconds and check the number of input URLs again:
```bash
docker-compose scale nginx=1
sleep 15

# check source urls
docker exec -i -t telegraf /bin/grep :9090 /etc/telegraf.conf
```

Finally, you can check the actual result of telemetry aggregation (via Telegraf) on the InfluxDB server.
You have to open InfluxDB UI with the following command:
```bash
open "http://$(triton ip influxdb):8083/"
```
choose 'telegraf' database in dropdown located on the top-right corner, type and execute a query
```
SHOW MEASUREMENTS
```
There should be a record like 'nginx_connections_load', which represents data coming from nginx telemetry.
The following query should display a list of nginx-specific telemetry records collected during the last 5 minutes:
```
SELECT * FROM nginx_connections_load WHERE time > now() - 5m
```

At the end of test you can shutdown containers with the following command:
```
docker-compose kill
```
5 changes: 5 additions & 0 deletions _env.telegraf
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# InfluxDB settings
# (uncomment to change default influxdb host/database to custom url)
#INFLUXDB_HOST=influxdb # docker alias or real hostname
#INFLUXDB_DATABASE=telegraf
#INFLUXDB_DATA_ENGINE=tsm1
55 changes: 55 additions & 0 deletions bin/reload.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/bin/bash

# Fall back to defaults when the caller's environment leaves these unset or
# empty: the service is called "telegraf" and Consul is reached as "consul".
: "${SERVICE_NAME:=telegraf}"
: "${CONSUL:=consul}"

# Render the Telegraf configuration template using values from Consul,
# but do not reload because Telegraf hasn't started yet.
# Reads $CONSUL (Consul host name; port 8500 is appended) and renders
# /etc/telegraf.ctmpl to /etc/telegraf.conf in a single pass (-once).
preStart() {
    # sleep 5 # give some time for other containerpilots to start before rendering config
    # The address is quoted so an unusual $CONSUL value cannot be word-split
    # into extra consul-template arguments.
    consul-template \
        -once \
        -dedup \
        -consul "${CONSUL}:8500" \
        -template "/etc/telegraf.ctmpl:/etc/telegraf.conf"
}

# Render the Telegraf configuration template using values from Consul,
# then gracefully reload Telegraf.
# Reads $CONSUL (Consul host name; port 8500 is appended). The third field of
# the -template argument is the command consul-template is given to run for
# this template: this script's own reloadConfig sub-command.
onChange() {
    # The address is quoted so an unusual $CONSUL value cannot be word-split
    # into extra consul-template arguments.
    consul-template \
        -once \
        -dedup \
        -consul "${CONSUL}:8500" \
        -template "/etc/telegraf.ctmpl:/etc/telegraf.conf:/usr/local/bin/reload.sh reloadConfig"
}

# Reload Telegraf by sending it SIGHUP.
# Note: if we fire SIGHUP at the process before it has a chance to register
#       its signal handler, then it will immediately exit. This ensures that
#       the process is listening on port 8094, which should only be the
#       case after we have the signal handler loaded.
# Blocks until a listener on port 8094 exists AND pkill managed to signal a
# telegraf process.
reloadConfig() {
    # Match ":8094" followed by whitespace (the local-address column of
    # `netstat -ln`) rather than any "8094" substring, and pause between
    # attempts so the wait does not spin a CPU core.
    until netstat -ln | grep -q ':8094[[:space:]]' && pkill -SIGHUP telegraf
    do
        sleep 1
    done
}
# Print the list of supported sub-commands, one per line, to stdout.
help() {
    printf '%s\n' \
        "Usage: ./reload.sh preStart => first-run configuration for Telegraf" \
        " ./reload.sh onChange => [default] update Telegraf config on upstream changes" \
        " ./reload.sh reloadConfig => reload Telegraf config on upstream changes"
}

# Command dispatcher.
# The statements between `until` and `do` form the loop condition; only the
# status of the last one, the `[ ... ]` test, decides whether the body runs.
#   - no argument:      onChange runs, then the empty $cmd expands to nothing.
#   - known function:   it is invoked with the remaining arguments.
#   - unknown command:  the shell reports status 127 ("command not found"),
#                       so the loop body falls back to onChange and exits.
until
# first argument selects the function to run
cmd=$1
if [ -z "$cmd" ]; then
# default action when called without arguments
onChange
fi
# drop the sub-command name so "$@" holds only its arguments
shift 1
$cmd "$@"
# condition is true (loop ends) unless the command was not found
[ "$?" -ne 127 ]
do
# fallback for an unrecognised sub-command
onChange
exit
done
36 changes: 36 additions & 0 deletions bin/sensor.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash
set -e

# Print a two-line description of this script to stdout.
help() {
    printf '%s\n' \
        'Uses cli tools free and top to determine current CPU and memory usage' \
        'for the telemetry service.'
}

# memory usage in percent
# Prints used/total memory as a percentage with two decimals on stdout,
# computed from the second line of `free -m` (column 3 * 100 / column 2).
# A trace message is written to stderr so stdout carries only the value.
sys_memory() {
    # awk oneliner to get memory usage
    # free -m | awk 'NR==2{printf "Memory Usage: %s/%sMB (%.2f%%)\n", $3,$2,$3*100/$2 }'
    # output:
    # Memory Usage: 15804/15959MB (99.03%)
    (>&2 echo "sys memory check fired")
    # Declare and assign separately: `local var=$(...)` always returns the
    # status of `local`, which would hide a failing pipeline from `set -e`.
    local memory
    memory=$(free -m | awk 'NR==2{printf "%.2f", $3*100/$2 }')
    echo "${memory}"
}

# cpu load
# Prints one load-average figure from top's summary line, with two decimals,
# on stdout: the third field from the end of that line, which is the first of
# the three averages when the summary ends in
# "load average: <1m>, <5m>, <15m>" - TODO confirm against the installed top.
# A trace message is written to stderr so stdout carries only the value.
sys_cpu() {
    # oneliner to display cpu load
    # top -bn1 | awk '/load average:/ {printf "CPU Load: %.2f\n", $(NF-2); exit}'
    (>&2 echo "sys cpu check fired")
    # Match the "load average:" summary line only and stop after it. A bare
    # `grep load` also matches process rows whose command contains "load"
    # (e.g. reload.sh), which appended extra numbers to the gauge value.
    # Declared separately from the assignment so `local` does not mask the
    # pipeline's exit status.
    local cpuload
    cpuload=$(top -bn1 | awk '/load average:/ {printf "%.2f", $(NF-2); exit}')
    echo "${cpuload}"
}

# Dispatch: when a sub-command name is given, run the function of that name
# with the remaining arguments and exit with its status; otherwise print the
# help text.
subcommand=$1
if [ -n "$subcommand" ]; then
    shift
    $subcommand "$@"
    exit
fi

help
78 changes: 78 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Telegraf autopilotpattern demo

# Consul - start with a single host which will bootstrap the cluster.
# In production we'll want to use an HA cluster.
# NOTE(review): the image below uses the floating `latest` tag, so rebuilds
# are not reproducible - consider pinning a specific version.
consul:
container_name: consul
image: progrium/consul:latest

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason to run progrium/consul over the official consul image?

restart: always
mem_limit: 128m
expose:
- 53
- 8300
- 8301
- 8302
- 8400
- 8500
ports:
- 8500 # expose only Consul's UI on the public IP
dns:
- 127.0.0.1
labels:
- triton.cns.services=consul
# single server that bootstraps itself and serves the web UI from /ui
command: -server -bootstrap -ui-dir /ui

# Telegraf should autodiscover all autopilot containers,
# collect telemetry from there, and send output to influxdb.
telegraf:
container_name: telegraf
#image: autopilotpattern/telegraf
build: . # for now use local image build
mem_limit: 128m
expose:
- 8094 # necessary for healthcheck
- 9090 # so we can see telemetry
restart: always
links:
- consul:consul
- influxdb:influxdb
environment:
# CONSUL_AGENT and CONSUL are read by the etc/containerpilot.json template:
# when CONSUL_AGENT is set it targets localhost:8500 and starts a local
# consul agent coprocess that joins the host named by CONSUL
- CONSUL_AGENT=1
- CONSUL=consul
- INFLUXDB=influxdb
env_file:
# _env is generated by setup.sh (see README); _env.telegraf holds the
# optional InfluxDB overrides
- _env
- _env.telegraf
labels:
- triton.cns.services=telegraf

# InfluxDB - receives the output collected by telegraf
# NOTE(review): the image below uses the floating `latest` tag, so rebuilds
# are not reproducible - consider pinning a specific version.
influxdb:
container_name: influxdb
image: influxdb:latest
mem_limit: 512m
ports:
# 8083 is the UI port opened in the README walkthrough
# NOTE(review): the roles of 8086 and 8090 are not stated in this project - confirm
- "8083:8083"
- "8086:8086"
- "8090:8090"
env_file:
- '_env'

# Nginx included for demonstration of input sources for telegraf
# we can scale it and demonstrate auto capturing of nginx containers telemetry by telegraf
# (no container_name is set here; the README scales this service with
# `docker-compose scale nginx=3`)
nginx:
image: autopilotpattern/nginx
restart: always
mem_limit: 128m
ports:
- 80 # http port
- 9090 # so we can see telemetry
links:
- consul:consul
environment:
- CONSUL_AGENT=1
- CONSUL=consul
- BACKEND=consul # backend is required, so lets point to consul just for demo purposes
env_file: _env
labels:
- triton.cns.services=nginx
50 changes: 50 additions & 0 deletions etc/containerpilot.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
{
"consul": "{{ if .CONSUL_AGENT }}localhost{{ else }}{{ .CONSUL }}{{ end }}:8500",
"preStart": "/usr/local/bin/reload.sh preStart",
"services": [
{
"name": "telegraf",
"port": 8094,
"health": "nc -vz localhost 8094",
"poll": 10,
"ttl": 25
}
],
"backends": [
{
"name": "nginx",
"poll": 7,
"onChange": "/usr/local/bin/reload.sh onChange"
}
],
"coprocesses": [{{ if .CONSUL_AGENT }}
{
"command": ["/usr/local/bin/consul", "agent",
"-data-dir=/var/lib/consul",
"-config-dir=/etc/consul",
"-rejoin",
"-retry-join", "{{ .CONSUL }}",
"-retry-max", "10",
"-retry-interval", "10s"],
"restarts": "unlimited"
}{{ end }}],
"telemetry": {
"port": 9090,
"sensors": [
{
"name": "telegraf_sys_memory_percent",
"help": "percentage of memory used",
"type": "gauge",
"poll": 5,
"check": ["/usr/local/bin/sensor.sh", "sys_memory"]
},
{
"name": "telegraf_sys_cpu_load",
"help": "cpu load",
"type": "gauge",
"poll": 5,
"check": ["/usr/local/bin/sensor.sh", "sys_cpu"]
}
]
}
}
Loading