From 5a981d952f181dbc18696c72ca84a9e1c6b18e3a Mon Sep 17 00:00:00 2001 From: MarkNjunge Date: Mon, 22 Apr 2024 21:27:38 +0300 Subject: [PATCH] Add OpenTelemetry --- README.md | 44 +- config/custom-environment-variables.json | 7 + config/default.json | 7 + observability/README.MD | 44 + observability/config/datasources.yml | 37 + observability/config/grafana.ini | 1580 ++++++++ observability/config/loki-config.yaml | 35 + .../config/otel-collector-config.yaml | 42 + observability/config/prometheus.yaml | 12 + observability/config/tempo.yaml | 59 + observability/docker-compose.yml | 62 + package-lock.json | 3572 +++++++++++++++-- package.json | 8 + src/config/index.ts | 7 + src/decorators/request-context.decorator.ts | 9 +- src/filters/all-exceptions-filter.ts | 8 +- src/interceptors/global.interceptor.ts | 4 +- src/logging/Logger.ts | 31 +- src/logging/Sample.transport.ts | 4 +- src/logging/loki.transport.ts | 108 + src/main.ts | 3 +- src/middleware/global.middleware.ts | 12 +- src/models/_shared/ApiError.dto.ts | 2 +- src/modules/app/app.service.ts | 7 +- src/modules/users/users.service.ts | 19 +- src/utils/instrumentation.ts | 72 + 26 files changed, 5489 insertions(+), 306 deletions(-) create mode 100644 observability/README.MD create mode 100644 observability/config/datasources.yml create mode 100644 observability/config/grafana.ini create mode 100644 observability/config/loki-config.yaml create mode 100644 observability/config/otel-collector-config.yaml create mode 100644 observability/config/prometheus.yaml create mode 100644 observability/config/tempo.yaml create mode 100644 observability/docker-compose.yml create mode 100644 src/logging/loki.transport.ts create mode 100644 src/utils/instrumentation.ts diff --git a/README.md b/README.md index d40b1e9..1ab7bb6 100644 --- a/README.md +++ b/README.md @@ -13,11 +13,13 @@ A boilerplate for [NestJS](https://nestjs.com/), using Fastify. 
- [Query Parsing](#query-parsing) - [File Upload](#file-upload) - [Logging](#logging) +- [Request Context](#request-context) - [Auth guard](#auth-guard) - [Rate limiting](#rate-limiting) - [Request body validation](#request-body-validation) - [Exception Handling](#errors--exception-handling) -- [Docker support](#docker) +- [OpenTelemetry](#opentelemetry) +- [Docker](#docker) - [Testing](#testing) - [Continuous Integration](#ci) @@ -211,7 +213,7 @@ this.logger.debug("Hello!", "AppService.getHello", { data: { user: "mark" } }); To log to other locations, a [custom transport](https://github.com/winstonjs/winston-transport) is needed. See [SampleTransport](src/logging/Sample.transport.ts) for an example. -#### Redact Private Keys +### Redact Private Keys Private keys are automatically redacted in logged data. @@ -226,16 +228,20 @@ The private keys are specified in [redact.ts](src/utils/redact.ts) } ``` +### Grafana Loki + +A log transport to Grafana Loki is implemented for observability. See [OpenTelemetry](#opentelemetry). + ## Request Context Request context. be accessed using the `@ReqCtx()` header. -It contains a `correlationId`. +It contains a `traceId`. 
```typescript @Get() function getHello(@ReqCtx() ctx: IReqCtx) { - console.log(ctx.correlationId) // c855677c64c654d1 + console.log(ctx.traceId) // 0d8df9931b05fbcd2262bc696a1410a6 } ``` @@ -328,7 +334,7 @@ throw new HttpException(404, `User ${1} was not found`, ErrorCodes.INVALID_USER, "status": 404, "message": "User 1 was not found", "code": "InvalidUser", - "correlationId": "775523bae019485d", + "traceId": "775523bae019485d", "meta": { "id": 1 } @@ -342,10 +348,36 @@ Regular errors an unhandled exceptions are also caught and returned as a 500 res "status": 500, "message": "Uncaught exception", "code": "InternalError", - "correlationId": "d3cb1b2b3388e3b1" + "traceId": "d3cb1b2b3388e3b1" } ``` +## OpenTelemetry + +[OpenTelemetry](https://opentelemetry.io/docs/languages/js/) support is included for traces and metrics. + +See [instrumentation.ts](./src/utils/instrumentation.ts) for config. + +See [Observability directory](./observability/README.MD) for a compose file with various services for collecting and viewing signals. + +> Logs are not yet supported in the SDK, so a log transport to Grafana Loki is present to fill the gap. + +### Traces + +Automatic instrumentation is enabled and will suit most needs. +Custom spans can be created as described in the [OpenTelemetry docs](https://opentelemetry.io/docs/languages/js/instrumentation/#create-spans). + +### Metrics + +[See OpenTelemetry docs](https://opentelemetry.io/docs/languages/js/instrumentation/#metrics) + +```typescript +const meter = opentelemetry.metrics.getMeter("UserService"); +const getUserCounter = meter.createCounter("get_user") +getUserCounter.add(1, { user_id: id }); +``` + + ## Docker The application can be run using docker. 
diff --git a/config/custom-environment-variables.json b/config/custom-environment-variables.json index 67567d3..687d5d1 100644 --- a/config/custom-environment-variables.json +++ b/config/custom-environment-variables.json @@ -25,5 +25,12 @@ "logging": { "timestampFormat": "LOGGING_TIMESTAMP_FORMAT", "sensitiveParams": "LOGGING_SENSITIVE_PARAMS" + }, + "instrumentation": { + "enabled": "INSTRUMENTATION_ENABLED", + "debug": "INSTRUMENTATION_DEBUG", + "traceUrl": "INSTRUMENTATION_TRACE_URL", + "metricsUrl": "INSTRUMENTATION_METRICS_URL", + "lokiHost": "INSTRUMENTATION_LOKI_HOST" } } diff --git a/config/default.json b/config/default.json index 233c8ab..470a833 100644 --- a/config/default.json +++ b/config/default.json @@ -37,5 +37,12 @@ "fileUpload": { "maxSize": 5242880, "removeAfterUpload": true + }, + "instrumentation": { + "enabled": false, + "debug": false, + "traceUrl": "http://localhost:4318/v1/traces", + "metricsUrl": "http://localhost:4318/v1/metrics", + "lokiHost": "http://127.0.0.1:9100" } } diff --git a/observability/README.MD b/observability/README.MD new file mode 100644 index 0000000..2647e5e --- /dev/null +++ b/observability/README.MD @@ -0,0 +1,44 @@ +# Observability + +OpenTelemetry observability stack with Grafana +- OpenTelemetry Collector - Receive telemetry data from SDK +- Grafana - Dashboard +- Grafana Loki - Logs +- Grafana Tempo - Traces +- Prometheus - Metrics + +## Signal Paths +Traces: `App -> Collector -> Tempo -> Grafana` + +Metrics: `App -> Collector -> Prometheus -> Grafana` + +Logs: `App -> Logger -> Loki -> Grafana` + + +## Usage + +Start the service +``` +docker compose up -d +``` + +Open Grafana +http://localhost:9000 +``` +username: admin +password: admin +``` + +Data sources for Loki, Tempo and Prometheus are auto-configured. + +Tempo is set up to be able to navigate to Loki logs using a trace id. 
+ +## Endpoints +| Service | Host | +|-------------------|:------------------------------------| +| Grafana Dashboard | http://localhost:9000 | +| OTel Collector | http://localhost:55679/debug/tracez | +| Prometheus | http://localhost:9090 | +| App Metrics | http://localhost:8889/metrics | +| Loki | http://localhost:9100 | + diff --git a/observability/config/datasources.yml b/observability/config/datasources.yml new file mode 100644 index 0000000..2a460ef --- /dev/null +++ b/observability/config/datasources.yml @@ -0,0 +1,37 @@ +apiVersion: 1 +datasources: + - name: loki + uid: loki + type: loki + access: proxy + url: http://loki:3100 + basicAuth: false + editable: true + isDefault: true + version: 1 + + - name: prometheus + uid: prometheus + type: prometheus + url: http://prometheus:9090 + access: proxy + basicAuth: false + editable: true + isDefault: false + version: 1 + + - name: tempo + uid: tempo + type: tempo + url: http://tempo:3200 + access: proxy + basicAuth: false + editable: true + isDefault: false + jsonData: + tracesToLogsV2: + customQuery: true + datasourceUid: loki + filterByTraceID: true + query: '{trace_id="${__trace.traceId}"}' + version: 1 diff --git a/observability/config/grafana.ini b/observability/config/grafana.ini new file mode 100644 index 0000000..ddc55de --- /dev/null +++ b/observability/config/grafana.ini @@ -0,0 +1,1580 @@ +##################### Grafana Configuration Example ##################### +# +# Everything has defaults so you only need to uncomment things you want to +# change + +# possible values : production, development +;app_mode = production + +# instance name, defaults to HOSTNAME environment variable value or hostname if HOSTNAME var is empty +;instance_name = ${HOSTNAME} + +# force migration will run migrations that might cause dataloss +;force_migration = false + +#################################### Paths #################################### +[paths] +# Path to where grafana can store temp files, sessions, and the 
sqlite3 db (if that is used) +;data = /var/lib/grafana + +# Temporary files in `data` directory older than given duration will be removed +;temp_data_lifetime = 24h + +# Directory where grafana can store logs +;logs = /var/log/grafana + +# Directory where grafana will automatically scan and look for plugins +;plugins = /var/lib/grafana/plugins + +# folder that contains provisioning config files that grafana will apply on startup and while running. +;provisioning = conf/provisioning + +#################################### Server #################################### +[server] +# Protocol (http, https, h2, socket) +;protocol = http + +# This is the minimum TLS version allowed. By default, this value is empty. Accepted values are: TLS1.2, TLS1.3. If nothing is set TLS1.2 would be taken +;min_tls_version = "" + +# The ip address to bind to, empty will bind to all interfaces +;http_addr = + +# The http port to use +http_port = 3000 + +# The public facing domain name used to access grafana from a browser +;domain = localhost + +# Redirect to correct domain if host header does not match domain +# Prevents DNS rebinding attacks +;enforce_domain = false + +# The full public facing url you use in browser, used for redirects and emails +# If you use reverse proxy and sub path specify full url (with sub path) +;root_url = %(protocol)s://%(domain)s:%(http_port)s/ + +# Serve Grafana from subpath specified in `root_url` setting. By default it is set to `false` for compatibility reasons. 
+;serve_from_sub_path = false + +# Log web requests +;router_logging = false + +# the path relative working path +;static_root_path = public + +# enable gzip +;enable_gzip = false + +# https certs & key file +;cert_file = +;cert_key = + +# Unix socket gid +# Changing the gid of a file without privileges requires that the target group is in the group of the process and that the process is the file owner +# It is recommended to set the gid as http server user gid +# Not set when the value is -1 +;socket_gid = + +# Unix socket mode +;socket_mode = + +# Unix socket path +;socket = + +# CDN Url +;cdn_url = + +# Sets the maximum time using a duration format (5s/5m/5ms) before timing out read of an incoming request and closing idle connections. +# `0` means there is no timeout for reading the request. +;read_timeout = 0 + +# This setting enables you to specify additional headers that the server adds to HTTP(S) responses. +[server.custom_response_headers] +#exampleHeader1 = exampleValue1 +#exampleHeader2 = exampleValue2 + +#################################### GRPC Server ######################### +;[grpc_server] +;network = "tcp" +;address = "127.0.0.1:10000" +;use_tls = false +;cert_file = +;key_file = + +#################################### Database #################################### +[database] +# You can configure the database connection by specifying type, host, name, user and password +# as separate properties or as on string using the url properties. + +# Either "mysql", "postgres" or "sqlite3", it's your choice +;type = sqlite3 +;host = 127.0.0.1:3306 +;name = grafana +;user = root +# If the password contains # or ; you have to wrap it with triple quotes. Ex """#password;""" +;password = + +# Use either URL or the previous fields to configure the database +# Example: mysql://user:secret@host:port/database +;url = + +# For "postgres", use either "disable", "require" or "verify-full" +# For "mysql", use either "true", "false", or "skip-verify". 
+;ssl_mode = disable + +# Database drivers may support different transaction isolation levels. +# Currently, only "mysql" driver supports isolation levels. +# If the value is empty - driver's default isolation level is applied. +# For "mysql" use "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ" or "SERIALIZABLE". +;isolation_level = + +;ca_cert_path = +;client_key_path = +;client_cert_path = +;server_cert_name = + +# For "sqlite3" only, path relative to data_path setting +;path = grafana.db + +# Max idle conn setting default is 2 +;max_idle_conn = 2 + +# Max conn setting default is 0 (mean not set) +;max_open_conn = + +# Connection Max Lifetime default is 14400 (means 14400 seconds or 4 hours) +;conn_max_lifetime = 14400 + +# Set to true to log the sql calls and execution times. +;log_queries = + +# For "sqlite3" only. cache mode setting used for connecting to the database. (private, shared) +;cache_mode = private + +# For "sqlite3" only. Enable/disable Write-Ahead Logging, https://sqlite.org/wal.html. Default is false. +;wal = false + +# For "mysql" only if migrationLocking feature toggle is set. How many seconds to wait before failing to lock the database for the migrations, default is 0. +;locking_attempt_timeout_sec = 0 + +# For "sqlite" only. How many times to retry query in case of database is locked failures. Default is 0 (disabled). +;query_retries = 0 + +# For "sqlite" only. How many times to retry transaction in case of database is locked failures. Default is 5. +;transaction_retries = 5 + +# Set to true to add metrics and tracing for database queries. +;instrument_queries = false + +################################### Data sources ######################### +[datasources] +# Upper limit of data sources that Grafana will return. This limit is a temporary configuration and it will be deprecated when pagination will be introduced on the list data sources API. 
+;datasource_limit = 5000 + +#################################### Cache server ############################# +[remote_cache] +# Either "redis", "memcached" or "database" default is "database" +;type = database + +# cache connectionstring options +# database: will use Grafana primary database. +# redis: config like redis server e.g. `addr=127.0.0.1:6379,pool_size=100,db=0,ssl=false`. Only addr is required. ssl may be 'true', 'false', or 'insecure'. +# memcache: 127.0.0.1:11211 +;connstr = + +# prefix prepended to all the keys in the remote cache +; prefix = + +# This enables encryption of values stored in the remote cache +;encryption = + +#################################### Data proxy ########################### +[dataproxy] + +# This enables data proxy logging, default is false +;logging = false + +# How long the data proxy waits to read the headers of the response before timing out, default is 30 seconds. +# This setting also applies to core backend HTTP data sources where query requests use an HTTP client with timeout set. +;timeout = 30 + +# How long the data proxy waits to establish a TCP connection before timing out, default is 10 seconds. +;dialTimeout = 10 + +# How many seconds the data proxy waits before sending a keepalive probe request. +;keep_alive_seconds = 30 + +# How many seconds the data proxy waits for a successful TLS Handshake before timing out. +;tls_handshake_timeout_seconds = 10 + +# How many seconds the data proxy will wait for a server's first response headers after +# fully writing the request headers if the request has an "Expect: 100-continue" +# header. A value of 0 will result in the body being sent immediately, without +# waiting for the server to approve. +;expect_continue_timeout_seconds = 1 + +# Optionally limits the total number of connections per host, including connections in the dialing, +# active, and idle states. On limit violation, dials will block. +# A value of zero (0) means no limit. 
+;max_conns_per_host = 0 + +# The maximum number of idle connections that Grafana will keep alive. +;max_idle_connections = 100 + +# How many seconds the data proxy keeps an idle connection open before timing out. +;idle_conn_timeout_seconds = 90 + +# If enabled and user is not anonymous, data proxy will add X-Grafana-User header with username into the request, default is false. +;send_user_header = false + +# Limit the amount of bytes that will be read/accepted from responses of outgoing HTTP requests. +;response_limit = 0 + +# Limits the number of rows that Grafana will process from SQL data sources. +;row_limit = 1000000 + +# Sets a custom value for the `User-Agent` header for outgoing data proxy requests. If empty, the default value is `Grafana/` (for example `Grafana/9.0.0`). +;user_agent = + +#################################### Analytics #################################### +[analytics] +# Server reporting, sends usage counters to stats.grafana.org every 24 hours. +# No ip addresses are being tracked, only simple counters to track +# running instances, dashboard and error counts. It is very helpful to us. +# Change this option to false to disable reporting. +;reporting_enabled = true + +# The name of the distributor of the Grafana instance. Ex hosted-grafana, grafana-labs +;reporting_distributor = grafana-labs + +# Set to false to disable all checks to https://grafana.com +# for new versions of grafana. The check is used +# in some UI views to notify that a grafana update exists. +# This option does not cause any auto updates, nor send any information +# only a GET request to https://raw.githubusercontent.com/grafana/grafana/main/latest.json to get the latest version. +;check_for_updates = true + +# Set to false to disable all checks to https://grafana.com +# for new versions of plugins. The check is used +# in some UI views to notify that a plugin update exists. 
+# This option does not cause any auto updates, nor send any information +# only a GET request to https://grafana.com to get the latest versions. +;check_for_plugin_updates = true + +# Google Analytics universal tracking code, only enabled if you specify an id here +;google_analytics_ua_id = + +# Google Analytics 4 tracking code, only enabled if you specify an id here +;google_analytics_4_id = + +# When Google Analytics 4 Enhanced event measurement is enabled, we will try to avoid sending duplicate events and let Google Analytics 4 detect navigation changes, etc. +;google_analytics_4_send_manual_page_views = false + +# Google Tag Manager ID, only enabled if you specify an id here +;google_tag_manager_id = + +# Rudderstack write key, enabled only if rudderstack_data_plane_url is also set +;rudderstack_write_key = + +# Rudderstack data plane url, enabled only if rudderstack_write_key is also set +;rudderstack_data_plane_url = + +# Rudderstack SDK url, optional, only valid if rudderstack_write_key and rudderstack_data_plane_url is also set +;rudderstack_sdk_url = + +# Rudderstack Config url, optional, used by Rudderstack SDK to fetch source config +;rudderstack_config_url = + +# Intercom secret, optional, used to hash user_id before passing to Intercom via Rudderstack +;intercom_secret = + +# Controls if the UI contains any links to user feedback forms +;feedback_links_enabled = true + +#################################### Security #################################### +[security] +# disable creation of admin user on first start of grafana +;disable_initial_admin_creation = false + +# default admin user, created on startup +;admin_user = admin + +# default admin password, can be changed before first start of grafana, or in profile settings +;admin_password = admin + +# default admin email, created on startup +;admin_email = admin@localhost + +# used for signing +;secret_key = SW2YcwTIb9zpOOhoPsMm + +# current key provider used for envelope encryption, default to static 
value specified by secret_key +;encryption_provider = secretKey.v1 + +# list of configured key providers, space separated (Enterprise only): e.g., awskms.v1 azurekv.v1 +;available_encryption_providers = + +# disable gravatar profile images +;disable_gravatar = false + +# data source proxy whitelist (ip_or_domain:port separated by spaces) +;data_source_proxy_whitelist = + +# disable protection against brute force login attempts +;disable_brute_force_login_protection = false + +# set to true if you host Grafana behind HTTPS. default is false. +;cookie_secure = false + +# set cookie SameSite attribute. defaults to `lax`. can be set to "lax", "strict", "none" and "disabled" +;cookie_samesite = lax + +# set to true if you want to allow browsers to render Grafana in a ,