Skip to content

Commit

Permalink
Trace mutex locks in otel and fix tracing issues (#5471)
Browse files Browse the repository at this point in the history
  • Loading branch information
dotansimha authored Aug 21, 2024
1 parent 7707b73 commit ce20e3d
Show file tree
Hide file tree
Showing 10 changed files with 106 additions and 31 deletions.
47 changes: 31 additions & 16 deletions deployment/grafana-dashboards/Container-Logs.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
"graphTooltip": 0,
"id": 54,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
Expand All @@ -39,12 +38,14 @@
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"axisSoftMin": 0,
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
Expand All @@ -53,6 +54,7 @@
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineStyle": {
"fill": "solid"
Expand Down Expand Up @@ -107,6 +109,7 @@
"sort": "none"
}
},
"pluginVersion": "11.2.0-74515",
"targets": [
{
"datasource": {
Expand Down Expand Up @@ -135,7 +138,6 @@
"id": 4,
"panels": [],
"repeat": "service",
"repeatDirection": "h",
"title": "$service",
"type": "row"
},
Expand All @@ -144,6 +146,10 @@
"type": "loki",
"uid": "grafanacloud-logs"
},
"fieldConfig": {
"defaults": {},
"overrides": []
},
"gridPos": {
"h": 13,
"w": 24,
Expand All @@ -154,21 +160,22 @@
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": true,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"pluginVersion": "11.2.0-74515",
"targets": [
{
"datasource": {
"type": "loki",
"uid": "grafanacloud-logs"
},
"editorMode": "code",
"expr": "{container_name=\"$service\"} | json | __error__!=\"JSONParserErr\" | level>30 ",
"expr": "{container_name=\"$service\"} | json | regexp `(?P<requestId>[a-f0-9-]{36})`| line_format \"{{ .msg }}\" | __error__ != `JSONParserErr` | level > 30",
"hide": false,
"key": "Q-4f74f5b0-5837-415a-b726-56138193c25e-0",
"queryType": "range",
Expand All @@ -183,6 +190,10 @@
"type": "loki",
"uid": "grafanacloud-logs"
},
"fieldConfig": {
"defaults": {},
"overrides": []
},
"gridPos": {
"h": 11,
"w": 24,
Expand All @@ -200,14 +211,15 @@
"sortOrder": "Descending",
"wrapLogMessage": false
},
"pluginVersion": "11.2.0-74515",
"targets": [
{
"datasource": {
"type": "loki",
"uid": "grafanacloud-logs"
},
"editorMode": "code",
"expr": "{container_name=\"$service\"} | json | __error__!=\"JSONParserErr\" ",
"expr": "{container_name=\"$service\"} | json | __error__ != `JSONParserErr` | regexp `(?P<requestId>[a-f0-9-]{36})` | line_format `{{ .msg }}`",
"queryType": "range",
"refId": "A"
}
Expand All @@ -216,28 +228,24 @@
"type": "logs"
}
],
"preload": false,
"refresh": "30s",
"revision": 1,
"schemaVersion": 38,
"style": "dark",
"schemaVersion": 39,
"tags": ["kubernetes", "logs"],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "rate-limiter",
"value": "rate-limiter"
"text": "graphql-api",
"value": "graphql-api"
},
"datasource": {
"type": "loki",
"uid": "grafanacloud-logs"
},
"definition": "",
"hide": 0,
"includeAll": false,
"label": "Service:",
"multi": false,
"name": "service",
"options": [],
"query": {
Expand All @@ -248,9 +256,16 @@
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"datasource": {
"type": "loki",
"uid": "grafanacloud-logs"
},
"filters": [],
"name": "Filters",
"type": "adhoc"
}
]
},
Expand All @@ -262,6 +277,6 @@
"timezone": "",
"title": "Container Logs",
"uid": "ih6KH0aVz",
"version": 8,
"version": 9,
"weekStart": ""
}
1 change: 1 addition & 0 deletions deployment/utils/contour.types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -787,6 +787,7 @@ export interface ContourValues {
};
[k: string]: unknown;
};
defaultStorageClass?: string;
imagePullSecrets?: unknown[];
imageRegistry?: string;
storageClass?: string;
Expand Down
21 changes: 18 additions & 3 deletions deployment/utils/observability.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ export class Observability {
},
},
podAnnotations: {
// This is done because open-telemetry collector doesn't always update the deployment
// when the config file changes. This will force-restart it.
'pulumi.com/update-timestamp': Date.now().toString(),
},
clusterRole: {
Expand Down Expand Up @@ -133,21 +135,30 @@ export class Observability {
limit_mib: 409,
spike_limit_mib: 128,
},
// Filter OpenTelemetry traces that are not needed for debugging.
'filter/traces': {
error_mode: 'ignore',
traces: {
span: [
// Ignore all HEAD/OPTIONS requests
'attributes["component"] == "proxy" and attributes["http.method"] == "HEAD"',
'attributes["component"] == "proxy" and attributes["http.method"] == "OPTIONS"',
// Ignore health checks
'attributes["component"] == "proxy" and attributes["http.method"] == "GET" and attributes["http.url"] == "/_readiness"',
'attributes["component"] == "proxy" and attributes["http.method"] == "GET" and attributes["http.url"] == "/_health"',
'attributes["component"] == "proxy" and attributes["http.method"] == "GET" and IsMatch(attributes["http.url"], ".*/_health") == true',
// Ignore Contour/Envoy traces for /usage requests
'attributes["component"] == "proxy" and attributes["http.method"] == "POST" and attributes["http.url"] == "/usage"',
'attributes["component"] == "proxy" and attributes["http.method"] == "POST" and IsMatch(attributes["upstream_cluster.name"], "default_usage-service-.*") == true',
'attributes["component"] == "proxy" and attributes["http.method"] == "POST" and IsMatch(attributes["upstream_cluster.name"], "default_app-.*") == true',
// Ignore metrics scraping
'attributes["component"] == "proxy" and attributes["http.method"] == "GET" and attributes["http.url"] == "/metrics"',
'attributes["component"] == "proxy" and attributes["http.method"] == "GET" and attributes["http.url"] == "/_readiness"',
'attributes["component"] == "proxy" and attributes["http.method"] == "GET" and attributes["http.url"] == "/_health"',
// Ignore webapp HTTP calls
'attributes["component"] == "proxy" and attributes["http.method"] == "GET" and IsMatch(attributes["upstream_cluster.name"], "default_app-.*") == true',
],
},
},
// Remove raw trace information that we don't really need and exposed by default.
'attributes/trace_filter': {
actions: [
'downstream_cluster',
Expand All @@ -156,12 +167,12 @@ export class Observability {
'zone',
'upstream_cluster',
'peer.address',
'response_flags',
].map(key => ({
key,
action: 'delete',
})),
},
// Remove attributes that are not needed for debugging.
'resource/trace_cleanup': {
attributes: [
'host.arch',
Expand All @@ -182,6 +193,10 @@ export class Observability {
action: 'delete',
})),
},
// Contour spans are not very human-readable by default, so we are transforming them
// into a format that is easier to understand.
// First, we modify the span URL to be relative, and remove the hostname and full path,
// Then, we rename to be "METHOD /path"
'transform/patch_envoy_spans': {
error_mode: 'ignore',
trace_statements: [
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { Injectable, Scope } from 'graphql-modules';
import { traceFn } from '@hive/service-common';
import { SchemaChangeType } from '@hive/storage';
import { FederationOrchestrator } from '../orchestrators/federation';
import { StitchingOrchestrator } from '../orchestrators/stitching';
Expand All @@ -14,6 +15,7 @@ import type {
Target,
} from './../../../../shared/entities';
import { ProjectType } from './../../../../shared/entities';
import { Logger } from './../../../shared/providers/logger';
import {
buildSchemaCheckFailureState,
ContractCheckInput,
Expand All @@ -40,6 +42,7 @@ export class CompositeModel {
private federationOrchestrator: FederationOrchestrator,
private stitchingOrchestrator: StitchingOrchestrator,
private checks: RegistryChecks,
private logger: Logger,
) {}

private async getContractChecks(args: {
Expand Down Expand Up @@ -83,6 +86,13 @@ export class CompositeModel {
);
}

@traceFn('Composite modern: check', {
initAttributes: args => ({
'hive.project.id': args.selector.project,
'hive.target.id': args.selector.target,
'hive.organization.id': args.selector.organization,
}),
})
async check({
input,
selector,
Expand Down Expand Up @@ -166,6 +176,7 @@ export class CompositeModel {
});

if (checksumCheck === 'unchanged') {
this.logger.info('No changes detected, skipping schema check');
return {
conclusion: SchemaCheckConclusion.Skip,
};
Expand All @@ -175,6 +186,7 @@ export class CompositeModel {
project.type === ProjectType.FEDERATION
? this.federationOrchestrator
: this.stitchingOrchestrator;
this.logger.debug('Orchestrator: %s', orchestrator);

const compositionCheck = await this.checks.composition({
orchestrator,
Expand Down Expand Up @@ -208,6 +220,7 @@ export class CompositeModel {
compositionCheck,
conditionalBreakingChangeDiffConfig,
});
this.logger.info('Contract checks: %o', contractChecks);

const [diffCheck, policyCheck] = await Promise.all([
this.checks.diff({
Expand All @@ -225,6 +238,8 @@ export class CompositeModel {
modifiedSdl: incoming.sdl,
}),
]);
this.logger.info('diff check status: %o', diffCheck);
this.logger.info('policy check status: %o', policyCheck);

if (
compositionCheck.status === 'failed' ||
Expand All @@ -233,6 +248,7 @@ export class CompositeModel {
// if any of the contract compositions failed, the schema check failed.
(contractChecks?.length && contractChecks.some(check => !isContractChecksSuccessful(check)))
) {
this.logger.debug('Schema check failed');
return {
conclusion: SchemaCheckConclusion.Failure,
state: buildSchemaCheckFailureState({
Expand All @@ -244,6 +260,7 @@ export class CompositeModel {
};
}

this.logger.debug('Schema check successful');
return {
conclusion: SchemaCheckConclusion.Success,
state: {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { Injectable, Scope } from 'graphql-modules';
import { traceFn } from '@hive/service-common';
import { SchemaChangeType } from '@hive/storage';
import { SingleOrchestrator } from '../orchestrators/single';
import { ConditionalBreakingChangeDiffConfig, RegistryChecks } from '../registry-checks';
Expand Down Expand Up @@ -26,6 +27,13 @@ export class SingleModel {
private logger: Logger,
) {}

@traceFn('Single modern: check', {
initAttributes: args => ({
'hive.project.id': args.selector.project,
'hive.target.id': args.selector.target,
'hive.organization.id': args.selector.organization,
}),
})
async check({
input,
selector,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import hashObject from 'object-hash';
import { CriticalityLevel } from '@graphql-inspector/core';
import type { CheckPolicyResponse } from '@hive/policy';
import type { CompositionFailureError, ContractsInputType } from '@hive/schema';
import { traceFn } from '@hive/service-common';
import {
HiveSchemaChangeModel,
type RegistryServiceUrlChangeSerializableChange,
Expand Down Expand Up @@ -346,6 +347,7 @@ export class RegistryChecks {
return existingSchemaResult.sdl ?? null;
}

@traceFn('RegistryChecks.policyCheck')
async policyCheck({
selector,
modifiedSdl,
Expand Down Expand Up @@ -397,6 +399,7 @@ export class RegistryChecks {
* Diff incoming and existing SDL and generate a list of changes.
* Uses usage stats to determine whether a change is safe or not (if available).
*/
@traceFn('RegistryChecks.diff')
async diff(args: {
/** The existing SDL */
existingSdl: string | null;
Expand Down
Loading

0 comments on commit ce20e3d

Please sign in to comment.