"""
Copyright 2017 New Relic, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This Lambda function receives log entries from CloudWatch Logs
and pushes them to New Relic Infrastructure - Cloud integrations.
It expects to be invoked based on CloudWatch Logs streams.
New Relic's license key must be encrypted using KMS following these
instructions:
1. After creating the Lambda based on the Blueprint, select it and open the
Environment Variables section.
2. Check that the "LICENSE_KEY" environment variable if properly filled-in.
3. If you changed anything, go to the start of the page and press "Save".
Logs should start to be processed by the Lambda. To check if everything is
functioning properly you can check the Monitoring tab and CloudWatch Logs.
For more detailed documentation, check New Relic's documentation site:
https://docs.newrelic.com/
"""

import asyncio
import datetime
import gzip
import json
import logging
import os
import re
import time
from base64 import b64decode
from enum import Enum
from urllib import request

import aiohttp

logger = logging.getLogger()

try:
    import newrelic.agent
except ImportError:
    pass
else:
    # The agent shouldn't be running on this function. Ensure it is shut down.
    newrelic.agent.shutdown_agent()

# Retrying configuration.
# Increasing these numbers will make the function run longer in case of
# communication failures, which will increase its cost.
# Decreasing these numbers could increase the probability of data loss.

# Upon receiving one of the following error codes, the request will be
# retried up to MAX_RETRIES times
RETRYABLE_ERROR_CODES = [408, 429]
# Maximum number of retries
MAX_RETRIES = 3
# Initial backoff (in seconds) between retries
INITIAL_BACKOFF = 1
# Multiplier factor for the backoff between retries
BACKOFF_MULTIPLIER = 2
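
# With the defaults above, a request that keeps hitting retryable errors is
# attempted at roughly t=0s, t=1s and t=3s (backoff of 1s, then 2s) before
# MaxRetriesException is raised.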

# Max length in bytes of the payload
MAX_PAYLOAD_SIZE = 1000 * 1024

# Individual request timeout in seconds (non-configurable)
INDIVIDUAL_REQUEST_TIMEOUT_DURATION = 3
INDIVIDUAL_REQUEST_TIMEOUT = aiohttp.ClientTimeout(
    total=INDIVIDUAL_REQUEST_TIMEOUT_DURATION
)

# Session max processing time (non-configurable).
# Reserves a time buffer for logs to be formatted before being sent.
SESSION_MAX_PROCESSING_TIME = 1

LAMBDA_LOG_GROUP_PREFIX = os.getenv("NR_LAMBDA_LOG_GROUP_PREFIX", "/aws/lambda")
VPC_LOG_GROUP_PREFIX = os.getenv("NR_VPC_LOG_GROUP_PREFIX", "/aws/vpc/flow-logs")

LAMBDA_NR_MONITORING_PATTERN = re.compile(r'.*"NR_LAMBDA_MONITORING')
REPORT_PATTERN = re.compile("REPORT RequestId:")
TIMEOUT_PATTERN = re.compile(
    r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z\s[\d\w-]+\sTask timed out after [\d.]+ seconds"
)
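# Example of a line matched by TIMEOUT_PATTERN (the values are illustrative):
# 2017-01-01T00:00:00.000Z 00112233-4455-6677-8899-aabbccddeeff Task timed out after 3.00 seconds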

# Lines matched by REQUEST_ID_PATTERN are generated by the Lambda service
# when it has to kill the function's runtime, e.g. for an out-of-memory error.
REQUEST_ID_PATTERN = re.compile(r"RequestId:\s([-a-zA-Z0-9]{36})\s(.*)", re.DOTALL)


class EntryType(Enum):
    VPC = "vpc"
    LAMBDA = "lambda"
    OTHER = "other"


INGEST_SERVICE_VERSION = "v1"

US_LOGGING_ENDPOINT = "https://log-api.newrelic.com/log/v1"
EU_LOGGING_ENDPOINT = "https://log-api.eu.newrelic.com/log/v1"

US_INFRA_ENDPOINT = "https://cloud-collector.newrelic.com"
EU_INFRA_ENDPOINT = "https://cloud-collector.eu01.nr-data.net"

INFRA_INGEST_SERVICE_PATHS = {
    EntryType.LAMBDA: "/aws/lambda",
    EntryType.VPC: "/aws/vpc",
    EntryType.OTHER: "/aws",
}

LAMBDA_REQUEST_ID_REGEX = re.compile(
    r"RequestId:\s"
    r"(?P<request_id>[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})"
)
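# Example of a line matched by LAMBDA_REQUEST_ID_REGEX (the id is illustrative):
# START RequestId: 00112233-4455-6677-8899-aabbccddeeff Version: $LATEST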

LOGGING_LAMBDA_VERSION = "1.0.3"
LOGGING_PLUGIN_METADATA = {"type": "lambda", "version": LOGGING_LAMBDA_VERSION}


class MaxRetriesException(Exception):
    pass


class BadRequestException(Exception):
    pass


async def http_post(session, url, data, headers):
    def _format_error(e, text):
        return "{}. {}".format(e, text)

    backoff = INITIAL_BACKOFF
    retries = 0

    while retries < MAX_RETRIES:
        if retries > 0:
            logger.info("Retrying in {} seconds".format(backoff))
            await asyncio.sleep(backoff)
            backoff *= BACKOFF_MULTIPLIER

        retries += 1

        try:
            resp = await session.post(
                url, data=data, headers=headers, timeout=INDIVIDUAL_REQUEST_TIMEOUT
            )
            resp.raise_for_status()
            return resp.status, resp.url
        except aiohttp.ClientResponseError as e:
            if e.status == 400:
                raise BadRequestException(_format_error(e, "Unexpected payload"))
            elif e.status == 403:
                raise BadRequestException(_format_error(e, "Review your license key"))
            elif e.status == 404:
                raise BadRequestException(
                    _format_error(e, "Review the region endpoint")
                )
            elif e.status in RETRYABLE_ERROR_CODES:
                logger.warning(f"There was a {e.status} error. Reason: {e.message}")
                # Now retry the request
                continue
            elif 400 <= e.status < 500:
                raise BadRequestException(e)
            # 5xx responses fall through here and are retried as well.
        except asyncio.TimeoutError:
            logger.warning(f"Timeout on {url} at attempt {retries}/{MAX_RETRIES}")
            # Now retry the request
            continue

    raise MaxRetriesException()


def _filter_log_lines(log_entry):
    """
    The EntryType.LAMBDA check guarantees that we'll be left with at least
    one log event after filtering.
    """
    final_log_events = []
    for event in log_entry["logEvents"]:
        message = event["message"]
        if REPORT_PATTERN.match(message) or _is_lambda_message(message):
            final_log_events.append(event)

    ret = log_entry.copy()
    ret["logEvents"] = final_log_events
    return ret


def _calculate_session_timeout():
    # Request 0
    total = INDIVIDUAL_REQUEST_TIMEOUT_DURATION

    # Requests 1 to N-1
    backoff = INITIAL_BACKOFF
    for _ in range(MAX_RETRIES - 1):
        total += backoff + INDIVIDUAL_REQUEST_TIMEOUT_DURATION
        backoff *= BACKOFF_MULTIPLIER

    # Finally, add the maximum worst-case-scenario expected processing time
    return total + SESSION_MAX_PROCESSING_TIME
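

# With the default constants, _calculate_session_timeout() returns
# 3 + (1 + 3) + (2 + 3) + 1 = 13 seconds.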


async def _send_log_entry(log_entry, context):
    """
    Sends the log entry to New Relic Infrastructure's ingest server. If
    necessary, entries are split into multiple payloads. The log entry is
    sent along with the Lambda function's execution context.
    """
    entry_type = _get_entry_type(log_entry)
    context = {
        "function_name": context.function_name,
        "invoked_function_arn": context.invoked_function_arn,
        "log_group_name": context.log_group_name,
        "log_stream_name": context.log_stream_name,
    }

    session_timeout = _calculate_session_timeout()
    async with aiohttp.ClientSession(
        timeout=aiohttp.ClientTimeout(total=session_timeout), trust_env=True
    ) as session:
        # Both Infrastructure and Logging require a "LICENSE_KEY" environment
        # variable. Sending data to the Infrastructure Pipeline is enabled by
        # default; to disable it, set "INFRA_ENABLED" to "false". Sending data
        # to the Logging Pipeline is disabled by default; to enable it, set
        # "LOGGING_ENABLED" to "true".
        # Instructions for how to find these keys are in the README.md.
        requests = []
        if _infra_enabled():
            if entry_type == EntryType.LAMBDA:
                # If this is one of our lambda entries, we should only send
                # the log lines we actually care about
                data = {
                    "context": context,
                    "entry": json.dumps(_filter_log_lines(log_entry)),
                }
            else:
                # VPC logs are infra requests that aren't Lambda invocations
                data = {"context": context, "entry": json.dumps(log_entry)}

            for payload in _generate_payloads(data, _split_infra_payload):
                requests.append(
                    _send_payload(
                        _get_infra_request_creator(entry_type, payload), session, True
                    )
                )

        if _logging_enabled():
            data = {"context": context, "entry": json.dumps(log_entry)}
            for payload in _generate_payloads(
                _package_log_payload(data), _split_log_payload
            ):
                requests.append(
                    _send_payload(_get_logging_request_creator(payload), session)
                )

        logger.debug("Sending data to New Relic.....")
        ini = time.perf_counter()
        result = await asyncio.gather(*requests)
        elapsed_millis = (time.perf_counter() - ini) * 1000
        logger.debug(f"Time elapsed to send to New Relic: {elapsed_millis:0.2f}ms")
        return result


async def _send_payload(request_creator, session, retry=False):
    try:
        req = request_creator()
        status, url = await http_post(
            session, req.get_full_url(), req.data, req.headers
        )
    except MaxRetriesException as e:
        logger.error("Retry limit reached. Failed to send log entry.")
        if retry:
            raise e
    except BadRequestException as e:
        logger.error(e)
    except asyncio.TimeoutError as e:
        logger.error("Session timed out. Failed to send log entry.")
        raise e
    except Exception as e:
        logger.error(f"Error occurred: {e}")
        raise e
    else:
        logger.info("Log entry sent. Response code: {}. url: {}".format(status, url))
        return status


def _generate_payloads(data, split_function):
    """
    Returns a list of payloads to be sent to New Relic.
    This function usually returns a single-element list, but it recursively
    splits the data when the compressed payload exceeds MAX_PAYLOAD_SIZE.
    """
    payload = gzip.compress(json.dumps(data).encode())
    if len(payload) < MAX_PAYLOAD_SIZE:
        return [payload]

    split_data = split_function(data)
    return _generate_payloads(split_data[0], split_function) + _generate_payloads(
        split_data[1], split_function
    )


def _get_license_key(license_key=None):
    """
    This function gets New Relic's license key from env vars.
    """
    if license_key:
        return license_key

    return os.getenv("LICENSE_KEY", "")


def _get_newrelic_tags(payload):
    """
    This function gets New Relic's tags from env vars and adds them to the
    payload. A tag is a key-value pair; multiple tags can be specified.
    Key and value are colon delimited; multiple key-value pairs are
    semicolon delimited, e.g. env:prod;team:myTeam
    """
    nr_tags_str = os.getenv("NR_TAGS", "")
    nr_delimiter = os.getenv("NR_ENV_DELIMITER", ";")
    if nr_tags_str:
        nr_tags = dict(
            # Split on the first colon only, so tag values may contain colons
            item.split(":", 1)
            for item in nr_tags_str.split(nr_delimiter)
            if not item.startswith(tuple(["aws:", "plugin:"]))
        )
        payload[0]["common"]["attributes"].update(nr_tags)


def _debug_logging_enabled():
    """
    Determines whether or not debug logging should be enabled based on the env var.
    Defaults to false.
    """
    return os.getenv("DEBUG_LOGGING_ENABLED", "false").lower() == "true"


##############
#  NR Infra  #
##############


def _infra_enabled():
    """
    This function returns whether to send info to New Relic Infrastructure.
    Enabled by default.
    """
    return os.getenv("INFRA_ENABLED", "true").lower() == "true"


def _get_infra_request_creator(entry_type, payload, ingest_host=None, license_key=None):
    def create_request():
        req = request.Request(_get_infra_url(entry_type, ingest_host), payload)
        req.add_header("X-License-Key", _get_license_key(license_key))
        req.add_header("Content-Encoding", "gzip")
        return req

    return create_request


def _get_infra_url(entry_type, ingest_host=None):
    """
    Returns the ingest service URL: a concatenation of HOST + PATH + VERSION.
    """
    if ingest_host is None:
        ingest_host = _get_infra_endpoint()

    path = INFRA_INGEST_SERVICE_PATHS[entry_type]
    return ingest_host + path + "/" + INGEST_SERVICE_VERSION
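

# For example, _get_infra_url(EntryType.LAMBDA) with a US license key yields:
# https://cloud-collector.newrelic.com/aws/lambda/v1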


def _is_lambda_message(message):
    """
    Matches messages that are sufficient to report a Lambda invocation.
    REPORT lines are not sufficient, just nice to have.
    """
    return (
        LAMBDA_NR_MONITORING_PATTERN.match(message)
        or TIMEOUT_PATTERN.match(message)
        or REQUEST_ID_PATTERN.match(message)
    )


def _get_entry_type(log_entry):
    """
    Returns the EntryType of the entry based on its log group and messages.
    """
    log_group = log_entry["logGroup"]
    if log_group.startswith(VPC_LOG_GROUP_PREFIX):
        return EntryType.VPC
    elif log_group.startswith(LAMBDA_LOG_GROUP_PREFIX) and any(
        _is_lambda_message(event["message"]) for event in log_entry["logEvents"]
    ):
        return EntryType.LAMBDA

    return EntryType.OTHER


def _get_infra_endpoint():
    """
    The service URL is determined by the license key's region.
    A different URL can be set via the NR_INFRA_ENDPOINT env var.
    """
    if "NR_INFRA_ENDPOINT" in os.environ:
        return os.environ["NR_INFRA_ENDPOINT"]

    return (
        EU_INFRA_ENDPOINT if _get_license_key().startswith("eu") else US_INFRA_ENDPOINT
    )


def _split_infra_payload(data):
    """
    When the data is bigger than the supported payload size, it is split
    into two different requests.
    """
    context = data["context"]
    entry = json.loads(data["entry"])
    logEvents = entry["logEvents"]
    half = len(logEvents) // 2

    return [
        _reconstruct_infra_data(context, entry, logEvents[:half]),
        _reconstruct_infra_data(context, entry, logEvents[half:]),
    ]


def _reconstruct_infra_data(context, entry, logEvents):
    entry["logEvents"] = logEvents
    return {"context": context, "entry": json.dumps(entry)}


################
#  NR Logging  #
################


def _logging_enabled():
    """
    This function returns whether to send info to New Relic Logging.
    Disabled by default.
    """
    return os.getenv("LOGGING_ENABLED", "false").lower() == "true"


def _get_logging_request_creator(payload, ingest_url=None, license_key=None):
    def create_request():
        req = request.Request(_get_logging_endpoint(ingest_url), payload)
        req.add_header("X-License-Key", _get_license_key(license_key))
        req.add_header("X-Event-Source", "logs")
        req.add_header("Content-Encoding", "gzip")
        return req

    return create_request


def _set_console_logging_level():
    """
    Sets the console logging level to DEBUG or INFO, depending on whether
    debug logging is enabled via the env var.
    """
    if _debug_logging_enabled():
        logger.setLevel(logging.DEBUG)
        logger.debug("Enabled debug mode")
    else:
        logger.setLevel(logging.INFO)


def _get_logging_endpoint(ingest_url=None):
    """
    The service URL is determined by the license key's region.
    A different URL can be set via the NR_LOGGING_ENDPOINT env var.
    """
    if ingest_url:
        return ingest_url

    if "NR_LOGGING_ENDPOINT" in os.environ:
        return os.environ["NR_LOGGING_ENDPOINT"]

    return (
        EU_LOGGING_ENDPOINT
        if _get_license_key().startswith("eu")
        else US_LOGGING_ENDPOINT
    )


def _package_log_payload(data):
    """
    Packages up a MELT request for log messages
    """
    entry = json.loads(data["entry"])
    log_events = entry["logEvents"]
    log_messages = []
    lambda_request_id = None

    for log_event in log_events:
        log_message = {
            "message": log_event["message"],
            "timestamp": log_event["timestamp"],
            "attributes": {"aws": {}},
        }
        for event_key in log_event:
            if event_key not in ("id", "message", "timestamp"):
                log_message["attributes"][event_key] = log_event[event_key]

        if entry["logGroup"].startswith(LAMBDA_LOG_GROUP_PREFIX):
            match = LAMBDA_REQUEST_ID_REGEX.search(log_event["message"])
            if match and match.group("request_id"):
                lambda_request_id = match.group("request_id")
            if lambda_request_id:
                log_message["attributes"]["aws"][
                    "lambda_request_id"
                ] = lambda_request_id

        log_messages.append(log_message)

    packaged_payload = [
        {
            "common": {
                "attributes": {
                    "plugin": LOGGING_PLUGIN_METADATA,
                    "aws": {
                        "logStream": entry["logStream"],
                        "logGroup": entry["logGroup"],
                    },
                }
            },
            "logs": log_messages,
        }
    ]
    _get_newrelic_tags(packaged_payload)
    return packaged_payload


def _split_log_payload(payload):
    """
    When the data is bigger than the supported payload size, it is split
    into two different requests.
    """
    common = payload[0]["common"]
    logs = payload[0]["logs"]
    half = len(logs) // 2

    return [
        _reconstruct_log_payload(common, logs[:half]),
        _reconstruct_log_payload(common, logs[half:]),
    ]


def _reconstruct_log_payload(common, logs):
    return [{"common": common, "logs": logs}]


####################
#  Lambda handler  #
####################


def lambda_handler(event, context):
    """
    This is the Lambda handler, which is called when the function is invoked.
    Changing the name of this function will require changes in the Lambda
    function's configuration.
    """
    _set_console_logging_level()

    # CloudWatch log entries are gzip-compressed and Base64-encoded
    event_data = b64decode(event["awslogs"]["data"])
    log_entry_str = gzip.decompress(event_data).decode("utf-8")
    log_entry = json.loads(log_entry_str)

    # Output additional helpful info if debug logging is enabled. It is not
    # enabled by default, since parsing into JSON might be slow. Indexing
    # with '[0]' without a safety check looks sketchy, but CloudWatch never
    # sends a log without at least one event.
    logger.debug(
        "logGroup: {}, logStream: {}, timestamp: {}".format(
            log_entry["logGroup"],
            log_entry["logStream"],
            datetime.datetime.fromtimestamp(
                log_entry["logEvents"][0]["timestamp"] / 1000.0
            ),
        )
    )

    asyncio.run(_send_log_entry(log_entry, context))

    # This makes it possible to chain this CloudWatch log consumer with
    # others using a success destination
    return event
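

# ---------------------------------------------------------------------------
# Local smoke test: a minimal sketch, not part of the deployed function. It
# fabricates the gzipped, Base64-encoded payload that CloudWatch Logs
# delivers and invokes the handler directly. All names and values below are
# made up for illustration; running it attempts a real send, so LICENSE_KEY
# must be set in the environment.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import types
    from base64 import b64encode

    sample_entry = {
        "logGroup": "/aws/lambda/example-function",
        "logStream": "2017/01/01/[$LATEST]0123456789abcdef",
        "logEvents": [
            {
                "id": "0",
                "timestamp": int(time.time() * 1000),
                "message": "hello from a local test",
            }
        ],
    }
    sample_event = {
        "awslogs": {
            "data": b64encode(
                gzip.compress(json.dumps(sample_entry).encode())
            ).decode()
        }
    }
    fake_context = types.SimpleNamespace(
        function_name="example-function",
        invoked_function_arn=(
            "arn:aws:lambda:us-east-1:000000000000:function:example-function"
        ),
        log_group_name="/aws/lambda/example-function",
        log_stream_name="local-test",
    )
    lambda_handler(sample_event, fake_context)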