Skip to content

CP-8403 Adding Telegraf-based metric collection. #81

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 16 additions & 8 deletions cmd/estat.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ def die(*args, **kwargs):
-q/-Q enable/disable latency histograms by size (default: off)
-y/-Y enable/disable the summary output (default: on)
-t/-T enable/disable emitting the summary total (default: on)
-j set output mode to JSON
-d LEVEL set BCC debug level
-e emit the resulting eBPF script without executing it

Expand All @@ -111,7 +112,6 @@ def die(*args, **kwargs):
particular the time spent allocating a block and time spent waiting for
the write I/O to complete. If POOL is not specified, defaults to tracing
the pool 'domain0'.

"""


Expand Down Expand Up @@ -149,6 +149,7 @@ def usage(msg):
script_arg = None
debug_level = 0
dump_bpf = False
output_mode = BCCHelper.ESTAT_PRINT_MODE


class Args:
Expand All @@ -161,6 +162,7 @@ class Args:
setattr(args, "latsize_hist", False)
setattr(args, "summary", True)
setattr(args, "total", True)
setattr(args, "json", False)

#
# We use getopt rather than argparse because it is very difficult to get
Expand All @@ -170,7 +172,7 @@ class Args:
# arguments.
#
try:
opts, rem_args = getopt.getopt(sys.argv[2:], "hmMa:lLzZqQyYnNtTd:e")
opts, rem_args = getopt.getopt(sys.argv[2:], "hmMa:lLjzZqQyYnNtTd:e")
except getopt.GetoptError as err:
die(err)

Expand All @@ -194,6 +196,7 @@ class Args:
dump_bpf = True
else:
switches = {'-l': "lat_hist",
'-j': "json",
'-z': "size_hist",
'-q': "latsize_hist",
'-y': "summary",
Expand All @@ -219,6 +222,9 @@ class Args:
if not (args.lat_hist or args.size_hist or args.latsize_hist):
args.lat_hist = True

if args.json:
output_mode = BCCHelper.ANALYTICS_PRINT_MODE

# Now that we are done parsing arguments, construct the text of the BPF program
try:
with open(base_dir + 'bpf/estat/' + program + '.c', 'r') as prog_file:
Expand Down Expand Up @@ -443,7 +449,7 @@ class Args:
probe_type + "'")

if args.lat_hist or args.size_hist or args.summary:
helper1 = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE)
helper1 = BCCHelper(b, output_mode)
helper1.add_key_type("name")
helper1.add_key_type("axis")

Expand All @@ -465,23 +471,24 @@ class Args:
"bytes")

if args.latsize_hist:
helper2 = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE)
helper2 = BCCHelper(b, output_mode)
helper2.add_aggregation("latsq", BCCHelper.LL_HISTOGRAM_AGGREGATION,
"microseconds")
helper2.add_key_type("size")
helper2.add_key_type("name")
helper2.add_key_type("axis")

if args.summary and args.total:
helper3 = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE)
helper3 = BCCHelper(b, output_mode)
helper3.add_aggregation("opst", BCCHelper.COUNT_AGGREGATION, "iops(/s)")
helper3.add_aggregation("datat", BCCHelper.SUM_AGGREGATION,
"throughput(k/s)")
helper3.add_key_type("name")

# Need real time;
print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) # TODO deduplicate this line
print(" Tracing enabled... Hit Ctrl-C to end.")
if not args.json:
print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) # TODO deduplicate line
print(" Tracing enabled... Hit Ctrl-C to end.")

# output
if monitor:
Expand All @@ -508,7 +515,8 @@ class Args:
helper1.printall(clear_data)
if args.summary and args.total:
helper3.printall(clear_data)
print("%-16s\n" % strftime("%D - %H:%M:%S %Z"))
if not args.json:
print("%-16s\n" % strftime("%D - %H:%M:%S %Z"))
except Exception as e:
die(e)
else:
Expand Down
3 changes: 3 additions & 0 deletions debian/rules
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,6 @@ override_dh_auto_install:
dh_install build/cmd/* /usr/bin
dh_install lib/* /usr/share/performance-diagnostics/lib
dh_install bpf/* /usr/share/performance-diagnostics/bpf
dh_install telegraf/delphix-telegraf-service telegraf/perf_playbook /usr/bin
dh_install telegraf/delphix-telegraf.service /lib/systemd/system
dh_install telegraf/telegraf* telegraf/*.sh /etc/telegraf
34 changes: 34 additions & 0 deletions telegraf/delphix-telegraf-service
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash
#
# Assemble the Telegraf configuration from fragments and then run telegraf.
# Which input fragments are included depends on whether the engine is
# object-based (DOSE) and whether the Performance Playbook is enabled
# (see the perf_playbook command, which manages the flag file).
#
BASE_CONFIG=/etc/telegraf/telegraf.base
DOSE_INPUTS=/etc/telegraf/telegraf.inputs.dose
PLAYBOOK_INPUTS=/etc/telegraf/telegraf.inputs.playbook
PLAYBOOK_FLAG=/etc/telegraf/PLAYBOOK_ENABLED
TELEGRAF_CONFIG=/etc/telegraf/telegraf.conf


# Succeeds if the pool is backed by an object store, as reported by zdb.
# Return grep's own exit status directly instead of testing "$?".
function engine_is_object_based() {
    zdb -C | grep -q "type: 'object_store'"
}

# Succeeds if the Performance Playbook flag file exists.
function playbook_is_enabled() {
    [[ -f "$PLAYBOOK_FLAG" ]]
}

# Rebuild the config from scratch on every start so that changes to the
# flag file or pool type take effect on service restart.
rm -f "$TELEGRAF_CONFIG"

if engine_is_object_based; then
    if playbook_is_enabled; then
        cat "$PLAYBOOK_INPUTS" "$DOSE_INPUTS" "$BASE_CONFIG" > "$TELEGRAF_CONFIG"
    else
        cat "$DOSE_INPUTS" "$BASE_CONFIG" > "$TELEGRAF_CONFIG"
    fi
else
    if playbook_is_enabled; then
        cat "$PLAYBOOK_INPUTS" "$BASE_CONFIG" > "$TELEGRAF_CONFIG"
    else
        cat "$BASE_CONFIG" > "$TELEGRAF_CONFIG"
    fi
fi

/usr/bin/telegraf -config "$TELEGRAF_CONFIG"
18 changes: 18 additions & 0 deletions telegraf/delphix-telegraf.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Systemd unit for the Delphix Telegraf metric collection agent. Runs the
# /usr/bin/delphix-telegraf-service wrapper, which assembles the telegraf
# configuration before launching telegraf itself.
[Unit]
Description=Delphix Telegraf Metric Collection Agent
Documentation=https://github.com/influxdata/telegraf
# NOTE(review): this unit is PartOf both delphix.target and
# delphix-platform.service, so stop/restart of either propagates here —
# confirm both bindings are intended.
PartOf=delphix.target
After=delphix-platform.service
PartOf=delphix-platform.service

[Service]
# Leading "-" makes the environment file optional; no error if absent.
EnvironmentFile=-/etc/default/telegraf
User=root
ExecStart=/usr/bin/delphix-telegraf-service
ExecReload=/bin/kill -HUP $MAINPID
Restart=on-failure
RestartForceExitStatus=SIGPIPE
# control-group: kill the wrapper script and the telegraf child together.
KillMode=control-group

[Install]
WantedBy=delphix.target
3 changes: 3 additions & 0 deletions telegraf/nfs-threads.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/sh
# Stream nfs_threads output for Telegraf, dropping the header line (the
# only line containing "thr"). --line-buffered flushes each metric line
# as it is produced instead of waiting for a full stdio buffer.
# "egrep" is deprecated in GNU grep; use the "grep -E" spelling.
nfs_threads | grep -E --line-buffered -v "thr"
53 changes: 53 additions & 0 deletions telegraf/perf_playbook
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/bin/bash
#
# Copyright (c) 2021 by Delphix. All rights reserved.
#
# Script that enables and disables the Performance Playbook configuration
# for metric collection by Telegraf. The state is a flag file that the
# delphix-telegraf-service wrapper checks when assembling the config.
#

PLAYBOOK_FLAG=/etc/telegraf/PLAYBOOK_ENABLED

#
# Print a timestamped error to stderr and exit non-zero.
#
function die() {
	echo -e "$(date +%T:%N:%z): $(basename "$0"): $*" >&2
	exit 1
}

# Creating/removing the flag file and restarting the service require root;
# fail early with a clear message.
[[ $EUID -ne 0 ]] && die "must be run as root"

#
# Process command.
#

function usage() {
	echo "$(basename "$0"): $*" >&2
	echo "Usage: $(basename "$0") [enable|disable]"
	exit 2
}

function enable_playbook() {
	date
	echo "Enabling Performance Playbook Metrics"
	touch "$PLAYBOOK_FLAG"
	systemctl restart delphix-telegraf
}

function disable_playbook() {
	date
	echo "Disabling Performance Playbook Metrics"
	# The flag is a plain file; plain -f suffices (-r was unnecessary
	# and risky if the variable were ever mis-set).
	rm -f "$PLAYBOOK_FLAG"
	systemctl restart delphix-telegraf
}

if [[ $# -ne 1 ]]; then
	usage
fi

case "$1" in
	enable) enable_playbook ;;
	disable) disable_playbook ;;
	*) usage ;;
esac
131 changes: 131 additions & 0 deletions telegraf/telegraf.base
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# Telegraf Configuration
#
# Configuration for telegraf agent
[agent]
interval = "10s"
round_interval = true
flush_interval = "10s"
metric_batch_size = 1000
metric_buffer_limit = 10000

###############################################################################
# OUTPUT PLUGINS #
###############################################################################
# Define the main metric output file. ZFS metrics, aggregated stats, and
# Performance Playbook (estat) data are excluded here because each is
# routed to its own dedicated output file below.
[[outputs.file]]
files = ["/var/log/telegraf/metrics.json"]
rotation_max_size = "50MB"
rotation_max_archives = 9
data_format = "json"
namedrop = ["*estat_*", "agg_*", "zfs", "zpool*", "zcache*"]

# Define output file for ZFS related metrics
[[outputs.file]]
files = ["/var/log/telegraf/metrics_zfs.json"]
rotation_max_size = "30MB"
rotation_max_archives = 5
data_format = "json"
namepass = ["zpool*", "zcache*", "zfs"]

# Define output file for Performance Playbook (estat) metrics
[[outputs.file]]
files = ["/var/log/telegraf/metrics_estat.json"]
rotation_max_size = "30MB"
rotation_max_archives = 5
data_format = "json"
namepass = ["*estat_*"]

# Define output file for aggregate statistics
[[outputs.file]]
files = ["/var/log/telegraf/metric_aggregates.json"]
rotation_max_size = "30MB"
rotation_max_archives = 5
data_format = "json"
namepass = ["agg_*"]

# Enable Live Monitoring, intended for internal use:
#[[outputs.influxdb]]
# urls = ["http://dbsvr.company.com:8086"]
# database = "live_metrics"
# skip_database_creation = true
# data_format = "influx"

###############################################################################
# INPUT PLUGINS #
###############################################################################

# Get CPU usage
[[inputs.cpu]]
percpu = true
totalcpu = true
collect_cpu_time = false
report_active = false
fieldpass = ["usage*"]

# Get mount point stats
[[inputs.disk]]
mount_points = ["/","/domain0"]

# Get disk I/O stats
[[inputs.diskio]]

# Track stats for the current metric files
[[inputs.filestat]]
files = ["/var/log/telegraf/metrics.json",
"/var/log/telegraf/metrics_estat.json",
"/var/log/telegraf/metrics_zfs.json",
"/var/log/telegraf/metric_aggregates.json"]

# Get Memory stats
[[inputs.mem]]

# Get some network interface stats
[[inputs.net]]
fieldpass = ["tcp*","bytes*","packets*","err*","drop*"]

# Track CPU and Memory for the "delphix-mgmt" service (and children).
[[inputs.procstat]]
systemd_unit = "delphix-mgmt.service"
include_systemd_children = true
namedrop = ["procstat_lookup"]
fieldpass = ["memory_usage", "cpu_usage", "memory_rss"]

# Track CPU and Memory for the "zfs-object-agent" service (and children).
[[inputs.procstat]]
systemd_unit = "zfs-object-agent.service"
include_systemd_children = true
namedrop = ["procstat_lookup"]
fieldpass = ["memory_usage", "cpu_usage", "memory_rss"]

# Get process counts
[[inputs.processes]]

# Get swap memory usage
[[inputs.swap]]

# Get misc 'other' stats (load and uptime)
[[inputs.system]]

# ZFS kstats (arcstat, abdstat, zfetch, etc)
[[inputs.zfs]]
interval = "1m"

# Detailed ZFS pool metrics from "zpool_influxdb" (noisy)
#[[inputs.exec]]
# commands = ["/usr/lib/x86_64-linux-gnu/zfs/zpool_influxdb"]
# data_format = "influx"

###############################################################################
# AGGREGATION PLUGINS #
###############################################################################
# Filtered aggregate statistics
# Calculate Min, Max, Mean, Std Deviation every hour for selected metrics:
# CPU Usage (%idle)
[[aggregators.basicstats]]
period = "1h"
drop_original = false
stats = ["min", "max", "mean", "stdev"]
name_prefix = "agg_"
namepass = ["cpu","disk","diskio","mem","net","processes","system","swap"]

Loading