Skip to content

Commit

Permalink
Track leader election status (#3101)
Browse files Browse the repository at this point in the history
  • Loading branch information
xvello authored Feb 15, 2019
1 parent feffb72 commit 70ee0a9
Show file tree
Hide file tree
Showing 15 changed files with 549 additions and 13 deletions.
8 changes: 8 additions & 0 deletions datadog_checks_base/datadog_checks/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .__about__ import __version__
from .checks import AgentCheck
from .checks.openmetrics import OpenMetricsBaseCheck

from .config import is_affirmative
from .errors import ConfigurationError
from .utils.common import ensure_bytes, ensure_unicode, to_string
Expand All @@ -14,9 +15,16 @@
except ImportError:
PDHBaseCheck = None

# Kubernetes dep will not always be installed
try:
from .checks.kube_leader import KubeLeaderElectionBaseCheck
except ImportError:
KubeLeaderElectionBaseCheck = None

__all__ = [
'__version__',
'AgentCheck',
'KubeLeaderElectionBaseCheck',
'OpenMetricsBaseCheck',
'PDHBaseCheck',
'ConfigurationError',
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# (C) Datadog, Inc. 2018
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)

from .base_check import KubeLeaderElectionBaseCheck
from .mixins import KubeLeaderElectionMixin
from .record import ElectionRecord

__all__ = ['KubeLeaderElectionMixin', 'ElectionRecord', 'KubeLeaderElectionBaseCheck']
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# (C) Datadog, Inc. 2018
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)
from .mixins import KubeLeaderElectionMixin
from .. import AgentCheck
from ...errors import CheckException


class KubeLeaderElectionBaseCheck(KubeLeaderElectionMixin, AgentCheck):
"""
KubeLeaderElectioBaseCheck is a class that helps instantiating a Kube Leader
Election mixin only with YAML configurations.
Example configuration::
instances:
- namespace (prefix for the metrics and check)
record_kind (endpoints or configmap)
record_name
record_namespace
tags (optional)
"""
def check(self, instance):
self.check_election_status(instance)
108 changes: 108 additions & 0 deletions datadog_checks_base/datadog_checks/base/checks/kube_leader/mixins.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# (C) Datadog, Inc. 2018
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)

from kubernetes import client, config

try:
import datadog_agent
except ImportError:
from ...stubs import datadog_agent

from .. import AgentCheck
from .record import ElectionRecord

# Known names of the leader election annotation,
# will be tried in the order of the list
ELECTION_ANNOTATION_NAMES = ["control-plane.alpha.kubernetes.io/leader"]


class KubeLeaderElectionMixin(object):
"""
This mixin uses the facilities of the AgentCheck class
"""

def check_election_status(self, config):
"""
Retrieves the leader-election annotation from a given object, and
submits metrics and a service check.
An integration warning is sent if the object is not retrievable,
or no record is found. Monitors on the service-check should have
no-data alerts enabled to account for this.
The config objet requires the following fields:
namespace (prefix for the metrics and check)
record_kind (endpoints or configmap)
record_name
record_namespace
tags (optional)
It reads the following agent configuration:
kubernetes_kubeconfig_path: defaut is to use in-cluster config
"""
try:
record = self._get_record(
config.get("record_kind", ""), config.get("record_name", ""), config.get("record_namespace", "")
)
self._report_status(config, record)
except Exception as e:
self.warning("Cannot retrieve leader election record {}: {}".format(config.get("record_name", ""), e))

@staticmethod
def _get_record(kind, name, namespace):
kubeconfig_path = datadog_agent.get_config('kubernetes_kubeconfig_path')
if kubeconfig_path:
config.load_kube_config(config_file=kubeconfig_path)
else:
config.load_incluster_config()
v1 = client.CoreV1Api()

obj = None
if kind.lower() in ["endpoints", "endpoint", "ep"]:
obj = v1.read_namespaced_endpoints(name, namespace)
elif kind.lower() in ["configmap", "cm"]:
obj = v1.read_namespaced_config_map(name, namespace)
else:
raise ValueError("Unknown kind {}".format(kind))

if not obj:
raise ValueError("Empty input object")

try:
annotations = obj.metadata.annotations
except AttributeError:
raise ValueError("Invalid input object type")

for name in ELECTION_ANNOTATION_NAMES:
if name in annotations:
return ElectionRecord(annotations[name])

# Could not find annotation
raise ValueError("Object has no leader election annotation")

def _report_status(self, config, record):
# Compute prefix for gauges and service check
prefix = config.get("namespace") + ".leader_election"

# Compute tags for gauges and service check
tags = []
for n in ["record_kind", "record_name", "record_namespace"]:
if n in config:
tags.append("{}:{}".format(n, config[n]))
tags += config.get("tags", [])

# Sanity check on the record
valid, reason = record.validate()
if not valid:
self.service_check(prefix + ".status", AgentCheck.CRITICAL, tags=tags, message=reason)
return # Stop here

# Report metrics
self.monotonic_count(prefix + ".transitions", record.transitions, tags)
self.gauge(prefix + ".lease_duration", record.lease_duration, tags)

leader_status = AgentCheck.OK
if record.seconds_until_renew + record.lease_duration < 0:
leader_status = AgentCheck.CRITICAL
self.service_check(prefix + ".status", leader_status, tags=tags, message=record.summary)
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# (C) Datadog, Inc. 2018
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)

import json
from datetime import datetime

from kubernetes.config.dateutil import parse_rfc3339

# If these fields are missing or empty, the service check
# will fail inconditionnaly. Fields taken from
# https://godoc.org/k8s.io/client-go/tools/leaderelection/resourcelock#LeaderElectionRecord
REQUIRED_FIELDS = [
("holderIdentity", "no current leader recorded"),
("leaseDurationSeconds", "no lease duration set"),
("renewTime", "no renew time set"),
("acquireTime", "no acquire time recorded"),
]


class ElectionRecord(object):
def __init__(self, record_string):
self._record = json.loads(record_string)

def validate(self):
reason_prefix = "Invalid record: "
# Test for required fields
for field, message in REQUIRED_FIELDS:
if field not in self._record or not self._record[field]:
return False, reason_prefix + message

if not self.renew_time:
return False, reason_prefix + "bad format for renewTime field"
if not self.acquire_time:
return False, reason_prefix + "bad format for acquireTime field"

# No issue, record is valid
return True, None

@property
def leader_name(self):
return self._record["holderIdentity"]

@property
def lease_duration(self):
return int(self._record["leaseDurationSeconds"])

@property
def renew_time(self):
try:
return parse_rfc3339(self._record.get("renewTime"))
except Exception:
return None

@property
def acquire_time(self):
try:
return parse_rfc3339(self._record.get("acquireTime"))
except Exception:
return None

@property
def transitions(self):
return self._record.get("leaderTransitions", 0)

@property
def seconds_until_renew(self):
"""
Returns the number of seconds between the current time
and the set renew time. It can be negative if the
leader election is running late.
"""
delta = self.renew_time - datetime.now(self.renew_time.tzinfo)
return delta.total_seconds()

@property
def summary(self):
return "Leader: {} since {}, next renew {}".format(self.leader_name, self.acquire_time, self.renew_time)
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ jaydebeapi==1.1.1
jpype1==0.6.3
kafka-python==1.4.4; sys_platform != 'win32'
kazoo==2.6.0; sys_platform != 'win32'
kubernetes==8.0.1
ldap3==2.5
lxml==4.2.6
meld3==1.0.2
Expand Down
1 change: 1 addition & 0 deletions datadog_checks_base/requirements.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
ddtrace==0.13.0
kubernetes==8.0.1
pyyaml==3.13
prometheus-client==0.3.0
protobuf==3.5.1
Expand Down
16 changes: 13 additions & 3 deletions datadog_checks_base/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,18 @@
LONG_DESC = f.read()


def get_requirements(fpath):
def get_requirements(fpath, exclude=[], only=[]):
with open(path.join(HERE, fpath), encoding='utf-8') as f:
return f.readlines()
requirements = []
for line in f:
name = line.split("==")[0]
if only:
if name in only:
requirements.append(line.rstrip())
else:
if name not in exclude:
requirements.append(line.rstrip())
return requirements


setup(
Expand Down Expand Up @@ -51,6 +60,7 @@ def get_requirements(fpath):
include_package_data=True,

extras_require={
'deps': get_requirements('requirements.in'),
'deps': get_requirements('requirements.in', exclude=['kubernetes']),
'kube': get_requirements('requirements.in', only=['kubernetes']),
},
)
Loading

0 comments on commit 70ee0a9

Please sign in to comment.