Store and retry forwarding events in case of exceptions (#759)
* Refactor forwarding and logs
* Store and retry forwarding events in case of exceptions
* Add a retry mechanism to store events on S3 in case of failure.
* Multiple functions can use the same bucket to store events
* Address comments
* Change retry execution to be triggered on custom invocations only
ge0Aja authored Apr 8, 2024
1 parent 4559eb1 commit 23886fb
Showing 10 changed files with 306 additions and 84 deletions.
11 changes: 10 additions & 1 deletion aws/logs_monitoring/README.md
@@ -120,7 +120,13 @@ If you can't install the Forwarder using the provided CloudFormation template, y
4. Set the environment variable `DD_ENHANCED_METRICS` to `false` on the Forwarder. This stops the Forwarder from generating enhanced metrics itself, but it will still forward custom metrics from other lambdas.
5. Some AWS accounts are configured such that triggers will not automatically create resource-based policies allowing Cloudwatch log groups to invoke the forwarder. Reference the [CloudWatchLogPermissions][103] to see which permissions are required for the forwarder to be invoked by Cloudwatch Log Events.
6. [Configure triggers][104].
7. Create an S3 bucket, and set environment variable `DD_S3_BUCKET_NAME` to the bucket name. Also provide `s3:GetObject`, `s3:PutObject`, and `s3:DeleteObject` permissions on this bucket to the Lambda execution role. This bucket is used to store the Lambda tags cache.
7. Create an S3 bucket, and set the environment variable `DD_S3_BUCKET_NAME` to the bucket name. Also grant `s3:GetObject`, `s3:PutObject`, `s3:ListBucket`, and `s3:DeleteObject` permissions on this bucket to the Lambda execution role (see the policy sketch below). This bucket is used to store the different tag caches (Lambda, S3, Step Functions, and Log Groups). Additionally, this bucket is used to store unforwarded events in case of forwarding exceptions.
8. Set the environment variable `DD_STORE_FAILED_EVENTS` to `true` to enable the forwarder to also store event data in the S3 bucket. In case of exceptions when sending logs, metrics, or traces to the intake, the forwarder stores the relevant data in the S3 bucket. On custom invocations, i.e. on receiving an event with the `retry` keyword set to a non-empty string (which can be triggered manually, see below), the forwarder retries sending the stored events. When successful, it clears the stored events from the bucket.

```bash
aws lambda invoke --function-name <function-name> --payload '{"retry":"true"}' out
```
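
For step 7, the bucket permissions can be granted with an inline policy on the execution role. A minimal sketch using boto3, assuming placeholder bucket and role names:

```python
import json

import boto3

iam = boto3.client("iam")

bucket = "my-forwarder-bucket"        # value of DD_S3_BUCKET_NAME (placeholder)
role_name = "datadog-forwarder-role"  # the forwarder's Lambda execution role (placeholder)

policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            # Object-level access for the tag caches and stored failed events
            "Effect": "Allow",
            "Action": ["s3:GetObject", "s3:PutObject", "s3:DeleteObject"],
            "Resource": f"arn:aws:s3:::{bucket}/*",
        },
        {
            # ListBucket applies to the bucket itself, not to its objects
            "Effect": "Allow",
            "Action": ["s3:ListBucket"],
            "Resource": f"arn:aws:s3:::{bucket}",
        },
    ],
}

iam.put_role_policy(
    RoleName=role_name,
    PolicyName="datadog-forwarder-s3-cache",
    PolicyDocument=json.dumps(policy),
)
```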


[101]: https://github.com/DataDog/datadog-serverless-functions/releases
[102]: https://app.datadoghq.com/organization-settings/api-keys
@@ -138,6 +144,9 @@ If you can't install the Forwarder using the provided CloudFormation template, y

If you encounter issues upgrading to the latest version, check the Troubleshooting section.

### Upgrade an older version to +3.107.0
Starting with version 3.107.0, the Lambda function can store unforwarded events in case of exceptions at the intake point. If the feature is enabled via the `DD_STORE_FAILED_EVENTS` environment variable, failing events are stored under a dedicated directory in the same S3 bucket used for the tags cache. The same bucket can be used to store events from several Lambda functions under unique subdirectories.
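
The per-function prefix is the SHA-1 digest of the lower-cased function ARN, mirroring `get_function_arn_digest` in `lambda_function.py` below; how that digest maps to subdirectory names inside the bucket is an assumption here. A short sketch for locating one function's stored data, with a placeholder bucket name and ARN:

```python
from hashlib import sha1

import boto3

# Mirrors get_function_arn_digest() in lambda_function.py: one unique prefix per function.
function_arn = "arn:aws:lambda:us-east-1:123456789012:function:datadog-forwarder"  # placeholder
prefix = sha1(function_arn.lower().encode("UTF-8")).hexdigest()

# List whatever the forwarder has stored under that prefix (key layout is an assumption).
s3 = boto3.client("s3")
response = s3.list_objects_v2(Bucket="my-forwarder-bucket", Prefix=prefix)
for obj in response.get("Contents", []):
    print(obj["Key"])
```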

### Upgrade an older version to +3.106.0
Starting with version 3.106.0, the Lambda function adds a prefix to cache filenames stored in the S3 bucket configured in `DD_S3_BUCKET_NAME`.
This allows the same bucket to be used to store cache files from several functions.
201 changes: 129 additions & 72 deletions aws/logs_monitoring/forwarder.py
@@ -15,7 +15,9 @@
from logs.datadog_client import DatadogClient
from logs.datadog_tcp_client import DatadogTCPClient
from logs.datadog_scrubber import DatadogScrubber
from logs.helpers import filter_logs
from logs.helpers import filter_logs, add_retry_tag
from retry.storage import Storage
from retry.enums import RetryPrefix
from settings import (
DD_API_KEY,
DD_USE_TCP,
@@ -25,92 +27,147 @@
DD_PORT,
DD_TRACE_INTAKE_URL,
DD_FORWARD_LOG,
DD_STORE_FAILED_EVENTS,
SCRUBBING_RULE_CONFIGS,
INCLUDE_AT_MATCH,
EXCLUDE_AT_MATCH,
)

logger = logging.getLogger()
logger.setLevel(logging.getLevelName(os.environ.get("DD_LOG_LEVEL", "INFO").upper()))
trace_connection = TraceConnection(
DD_TRACE_INTAKE_URL, DD_API_KEY, DD_SKIP_SSL_VALIDATION
)


def forward(logs, metrics, traces):
"""
Forward logs, metrics, and traces to Datadog in a background thread.
"""
if DD_FORWARD_LOG:
_forward_logs(logs)

_forward_metrics(metrics)

if len(traces) > 0:
_forward_traces(traces)


def _forward_logs(logs):
"""Forward logs to Datadog"""
if logger.isEnabledFor(logging.DEBUG):
logger.debug(f"Forwarding {len(logs)} logs")
logs_to_forward = filter_logs(
[json.dumps(log, ensure_ascii=False) for log in logs],
include_pattern=INCLUDE_AT_MATCH,
exclude_pattern=EXCLUDE_AT_MATCH,
)
scrubber = DatadogScrubber(SCRUBBING_RULE_CONFIGS)
if DD_USE_TCP:
batcher = DatadogBatcher(256 * 1000, 256 * 1000, 1)
cli = DatadogTCPClient(DD_URL, DD_PORT, DD_NO_SSL, DD_API_KEY, scrubber)
else:
batcher = DatadogBatcher(512 * 1000, 4 * 1000 * 1000, 400)
cli = DatadogHTTPClient(
DD_URL, DD_PORT, DD_NO_SSL, DD_SKIP_SSL_VALIDATION, DD_API_KEY, scrubber
class Forwarder(object):
def __init__(self, function_prefix):
self.trace_connection = TraceConnection(
DD_TRACE_INTAKE_URL, DD_API_KEY, DD_SKIP_SSL_VALIDATION
)
self.storage = Storage(function_prefix)

with DatadogClient(cli) as client:
try:
def forward(self, logs, metrics, traces):
"""
Forward logs, metrics, and traces to Datadog in a background thread.
"""
if DD_FORWARD_LOG:
self._forward_logs(logs)
self._forward_metrics(metrics)
self._forward_traces(traces)

def retry(self):
"""
Retry forwarding logs, metrics, and traces to Datadog.
"""
for prefix in RetryPrefix:
self._retry_prefix(prefix)

def _retry_prefix(self, prefix):
if logger.isEnabledFor(logging.DEBUG):
logger.debug(f"Retrying {prefix} data")

key_data = self.storage.get_data(prefix)

for k, d in key_data.items():
if d is None:
continue
match prefix:
case RetryPrefix.LOGS:
self._forward_logs(d, key=k)
case RetryPrefix.METRICS:
self._forward_metrics(d, key=k)
case RetryPrefix.TRACES:
self._forward_traces(d, key=k)

def _forward_logs(self, logs, key=None):
"""Forward logs to Datadog"""
if logger.isEnabledFor(logging.DEBUG):
logger.debug(f"Forwarding {len(logs)} logs")

logs_to_forward = []
for log in logs:
if key:
log = add_retry_tag(log)
logs_to_forward.append(json.dumps(log, ensure_ascii=False))

logs_to_forward = filter_logs(
logs_to_forward, INCLUDE_AT_MATCH, EXCLUDE_AT_MATCH
)

scrubber = DatadogScrubber(SCRUBBING_RULE_CONFIGS)
if DD_USE_TCP:
batcher = DatadogBatcher(256 * 1000, 256 * 1000, 1)
cli = DatadogTCPClient(DD_URL, DD_PORT, DD_NO_SSL, DD_API_KEY, scrubber)
else:
batcher = DatadogBatcher(512 * 1000, 4 * 1000 * 1000, 400)
cli = DatadogHTTPClient(
DD_URL, DD_PORT, DD_NO_SSL, DD_SKIP_SSL_VALIDATION, DD_API_KEY, scrubber
)

failed_logs = []
with DatadogClient(cli) as client:
for batch in batcher.batch(logs_to_forward):
client.send(batch)
try:
client.send(batch)
except Exception:
logger.exception(f"Exception while forwarding log batch {batch}")
failed_logs.extend(batch)
else:
if logger.isEnabledFor(logging.DEBUG):
logger.debug(f"Forwarded log batch: {batch}")
if key:
self.storage.delete_data(key)

if DD_STORE_FAILED_EVENTS and len(failed_logs) > 0 and not key:
self.storage.store_data(RetryPrefix.LOGS, failed_logs)

send_event_metric("logs_forwarded", len(logs_to_forward) - len(failed_logs))

def _forward_metrics(self, metrics, key=None):
"""
Forward custom metrics submitted via logs to Datadog in a background thread
using `lambda_stats` that is provided by the Datadog Python Lambda Layer.
"""
if logger.isEnabledFor(logging.DEBUG):
logger.debug(f"Forwarding {len(metrics)} metrics")

failed_metrics = []
for metric in metrics:
try:
send_log_metric(metric)
except Exception:
logger.exception(
f"Exception while forwarding metric {json.dumps(metric)}"
)
failed_metrics.append(metric)
else:
if logger.isEnabledFor(logging.DEBUG):
logger.debug(f"Forwarded log batch: {json.dumps(batch)}")
logger.debug(f"Forwarded metric: {json.dumps(metric)}")
if key:
self.storage.delete_data(key)

if DD_STORE_FAILED_EVENTS and len(failed_metrics) > 0 and not key:
self.storage.store_data(RetryPrefix.METRICS, failed_metrics)

send_event_metric("metrics_forwarded", len(metrics) - len(failed_metrics))

def _forward_traces(self, traces, key=None):
if not len(traces) > 0:
return

if logger.isEnabledFor(logging.DEBUG):
logger.debug(f"Forwarding {len(traces)} traces")

try:
serialized_trace_payloads = json.dumps(traces)
self.trace_connection.send_traces(serialized_trace_payloads)
except Exception:
logger.exception(
f"Exception while forwarding log batch {json.dumps(batch)}"
f"Exception while forwarding traces {serialized_trace_paylods}"
)
if DD_STORE_FAILED_EVENTS and not key:
self.storage.store_data(RetryPrefix.TRACES, traces)
else:
send_event_metric("logs_forwarded", len(logs_to_forward))


def _forward_metrics(metrics):
"""
Forward custom metrics submitted via logs to Datadog in a background thread
using `lambda_stats` that is provided by the Datadog Python Lambda Layer.
"""
if logger.isEnabledFor(logging.DEBUG):
logger.debug(f"Forwarding {len(metrics)} metrics")
try:
for metric in metrics:
send_log_metric(metric)
if logger.isEnabledFor(logging.DEBUG):
logger.debug(f"Forwarded metric: {json.dumps(metric)}")
except Exception:
logger.exception(f"Exception while forwarding metric {json.dumps(metric)}")
else:
send_event_metric("metrics_forwarded", len(metrics))


def _forward_traces(trace_payloads):
if logger.isEnabledFor(logging.DEBUG):
logger.debug(f"Forwarding {len(trace_payloads)} traces")
try:
trace_connection.send_traces(trace_payloads)
except Exception:
logger.exception(
f"Exception while forwarding traces {json.dumps(trace_payloads)}"
)
else:
if logger.isEnabledFor(logging.DEBUG):
logger.debug(f"Forwarded traces: {json.dumps(trace_payloads)}")
send_event_metric("traces_forwarded", len(trace_payloads))
logger.debug(f"Forwarded traces: {serialized_trace_paylods}")
if key:
self.storage.delete_data(key)
send_event_metric("traces_forwarded", len(traces))
38 changes: 30 additions & 8 deletions aws/logs_monitoring/lambda_function.py
@@ -17,13 +17,14 @@
from steps.transformation import transform
from steps.splitting import split
from caching.cache_layer import CacheLayer
from forwarder import forward
from forwarder import Forwarder
from settings import (
DD_API_KEY,
DD_SKIP_SSL_VALIDATION,
DD_API_URL,
DD_FORWARDER_VERSION,
DD_ADDITIONAL_TARGET_LAMBDAS,
DD_RETRY_KEYWORD,
)


@@ -55,6 +56,7 @@
api._cacert = not DD_SKIP_SSL_VALIDATION

cache_layer = None
forwarder = None


def datadog_forwarder(event, context):
@@ -66,31 +68,51 @@ def datadog_forwarder(event, context):
if DD_ADDITIONAL_TARGET_LAMBDAS:
invoke_additional_target_lambdas(event)

init_cache_layer(context)
function_prefix = get_function_arn_digest(context)
init_cache_layer(function_prefix)
init_forwarder(function_prefix)

parsed = parse(event, context, cache_layer)
enriched = enrich(parsed, cache_layer)
transformed = transform(enriched)
metrics, logs, trace_payloads = split(transformed)

forward(logs, metrics, trace_payloads)
forwarder.forward(logs, metrics, trace_payloads)
parse_and_submit_enhanced_metrics(logs, cache_layer)

try:
if bool(event.get(DD_RETRY_KEYWORD, False)) is True:
forwarder.retry()
except Exception as e:
if logger.isEnabledFor(logging.DEBUG):
logger.debug(f"Failed to retry forwarding {e}")
pass

def init_cache_layer(context):

def init_cache_layer(function_prefix):
global cache_layer
if cache_layer is None:
# set the prefix for cache layer
try:
if not cache_layer:
function_arn = context.invoked_function_arn.lower()
prefix = sha1(function_arn.encode("UTF-8")).hexdigest()
cache_layer = CacheLayer(prefix)
if cache_layer is None:
cache_layer = CacheLayer(function_prefix)
except Exception as e:
logger.exception(f"Failed to create cache layer due to {e}")
raise


def init_forwarder(function_prefix):
global forwarder
if forwarder is None:
forwarder = Forwarder(function_prefix)


def get_function_arn_digest(context):
function_arn = context.invoked_function_arn.lower()
prefix = sha1(function_arn.encode("UTF-8")).hexdigest()
return prefix


def invoke_additional_target_lambdas(event):
lambda_client = boto3.client("lambda")
lambda_arns = DD_ADDITIONAL_TARGET_LAMBDAS.split(",")
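
The retry branch in `datadog_forwarder` above only runs when the incoming event carries the retry keyword with a non-empty value, matching the `{"retry": "true"}` payload shown in the README. A sketch of triggering it from Python instead of the AWS CLI; the function name is a placeholder:

```python
import json

import boto3

lambda_client = boto3.client("lambda")

# Equivalent to: aws lambda invoke --function-name <function-name> --payload '{"retry":"true"}' out
response = lambda_client.invoke(
    FunctionName="datadog-forwarder",  # placeholder
    Payload=json.dumps({"retry": "true"}).encode("utf-8"),
)
print(response["StatusCode"])
```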
13 changes: 13 additions & 0 deletions aws/logs_monitoring/logs/helpers.py
@@ -8,8 +8,11 @@
import re
import gzip
import os
import json
from logs.exceptions import ScrubbingException

from settings import DD_CUSTOM_TAGS, DD_RETRY_KEYWORD

logger = logging.getLogger()
logger.setLevel(logging.getLevelName(os.environ.get("DD_LOG_LEVEL", "INFO").upper()))

@@ -75,3 +78,13 @@ def compileRegex(rule, pattern):
raise Exception(
"could not compile {} regex with pattern: {}".format(rule, pattern)
)


def add_retry_tag(log):
try:
log = json.loads(log)
log[DD_CUSTOM_TAGS] = log.get(DD_CUSTOM_TAGS, "") + f",{DD_RETRY_KEYWORD}:true"
except Exception:
logger.warning(f"cannot add retry tag for log {log}")

return log
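
For illustration, a self-contained sketch of what `add_retry_tag` does to one serialized log, assuming `DD_CUSTOM_TAGS` resolves to the `ddtags` field and `DD_RETRY_KEYWORD` to `retry` (both are assumptions about settings.py):

```python
import json

# Assumed values of the settings constants used by add_retry_tag
DD_CUSTOM_TAGS = "ddtags"
DD_RETRY_KEYWORD = "retry"

log = json.dumps({"message": "hello", "ddtags": "env:prod,service:api"})

parsed = json.loads(log)
parsed[DD_CUSTOM_TAGS] = parsed.get(DD_CUSTOM_TAGS, "") + f",{DD_RETRY_KEYWORD}:true"
print(parsed[DD_CUSTOM_TAGS])  # env:prod,service:api,retry:true
```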
10 changes: 10 additions & 0 deletions aws/logs_monitoring/retry/enums.py
@@ -0,0 +1,10 @@
from enum import Enum


class RetryPrefix(Enum):
LOGS = "logs"
METRICS = "metrics"
TRACES = "traces"

def __str__(self):
return self.value
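
Since `__str__` returns the raw value, the enum members can double as path segments when building storage keys; the key shape below is only an assumption:

```python
from retry.enums import RetryPrefix

for prefix in RetryPrefix:
    # str(RetryPrefix.LOGS) == "logs", so e.g. "abc123/logs", "abc123/metrics", "abc123/traces"
    print("abc123/" + str(prefix))
```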
