Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 84 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,26 @@ jobs:
done
echo "All ${{ matrix.environment }} manifests rendered successfully."

# The druid catalog chart deploys through the druid-tenants
# ApplicationSet (base values layered with per-tenant values), so the
# overlay loop above never touches it. Render it with synthetic tenant
# values so the chart's templates pass through the same render-assert,
# schema, and misconfiguration gates as every kustomize root.
- name: Render druid catalog chart
# Renders the chart as release "ci" in namespace "druid-ci". Every --set
# value below is a synthetic stand-in for a per-tenant value. The output is
# written into rendered/, the directory the later gates scan.
run: |
helm template ci catalog/druid/chart -n druid-ci \
-f catalog/druid/values.yaml \
--set hostedId=ci \
--set environment=${{ matrix.environment }} \
--set secrets.metadata=ci/druid/metadata \
--set secrets.admin=ci/druid/admin \
--set secrets.system=ci/druid/system \
--set secrets.keystore=ci/druid/keystore \
--set s3.indexLogsBucket=ci-druid-indexlogs \
--set s3.deepStorageBucket=ci-druid-deepstorage \
--set s3.msqBucket=ci-druid-msq \
> rendered/catalog_druid_chart.yaml

# Source files are gated by the zero-placeholder job; this asserts the
# RENDERED output too — catching a zero account id, placeholder token, or
# account-less ARN that only appears after kustomize/helm templating.
Expand All @@ -77,6 +97,69 @@ jobs:
manifests: rendered
exclude: 'mcp-tunnel'

# ── Schema gate ────────────────────────────────────────────────────
# kubeconform in strict mode over everything just rendered. Native
# kinds resolve from the default kubernetes-json-schema location; CRD
# kinds resolve from the datreeio CRDs-catalog. There is deliberately
# no -ignore-missing-schemas: a kind neither source knows fails the
# build until it gets a schema or an explicit, justified -skip below.
# -skip Grafana — the single external Grafana CR ships
# spec.external.url empty by design (the dashboards ApplicationSet
# injects the per-cluster Amazon Managed Grafana URL via its cluster
# generator patch), and the catalog schema is stricter than the
# upstream CRD (requires tenantNamespace, which upstream does not).
#
# The binary and downloaded schemas are cached; restore-keys lets each
# run reuse the newest cache while still persisting newly fetched
# schemas.
- name: Cache kubeconform + schemas
uses: actions/cache@v6
with:
# Both the installed binary and the schema download directory are cached.
path: |
~/.local/bin/kubeconform
~/.cache/kubeconform
# The run id makes the key unique per workflow run, so no earlier run's
# cache is an exact match and restore falls through to restore-keys.
key: kubeconform-v0.8.0-${{ github.run_id }}
# Prefix match on the pinned kubeconform version.
restore-keys: |
kubeconform-v0.8.0-

# Installs the pinned kubeconform release into ~/.local/bin unless an
# executable is already there (e.g. restored by the cache step), then adds
# that directory to PATH for the following steps.
- name: Install kubeconform
  run: |
    # A step with no explicit `shell:` runs under plain `bash -e`, which
    # does not enable pipefail. Turn it on so a failed download fails the
    # step at curl rather than surfacing only as a tar error.
    set -euo pipefail
    if [ ! -x "$HOME/.local/bin/kubeconform" ]; then
      mkdir -p "$HOME/.local/bin"
      curl -fsSL https://github.com/yannh/kubeconform/releases/download/v0.8.0/kubeconform-linux-amd64.tar.gz \
        | tar -xz -C "$HOME/.local/bin" kubeconform
    fi
    echo "$HOME/.local/bin" >> "$GITHUB_PATH"

- name: Schema gate (kubeconform)
# Validates everything under rendered/ in strict mode. Downloaded schemas go
# to the same ~/.cache/kubeconform directory that the cache step persists.
# Grafana is the only kind skipped.
run: |
mkdir -p "$HOME/.cache/kubeconform"
kubeconform -strict -summary \
-cache "$HOME/.cache/kubeconform" \
-schema-location default \
-schema-location 'https://raw.githubusercontent.com/datreeio/CRDs-catalog/main/{{.Group}}/{{.ResourceKind}}_{{.ResourceAPIVersion}}.json' \
-skip Grafana \
rendered

# ── Misconfiguration gate ──────────────────────────────────────────
# trivy config over the rendered manifests — not the repo tree — so
# every check sees the post-templating truth with helm values applied.
# MEDIUM and above hard-fails; .trivyignore.yaml carries the scoped,
# justified exceptions (device-plugin root/hostPath, druid's
# k8s-extensions Role, env-indirected config keys).
- name: Install trivy
# Both the setup action and the trivy release it installs are pinned.
uses: aquasecurity/setup-trivy@v0.3.1
with:
version: v0.72.0
# NOTE(review): presumably caches the downloaded trivy binary — confirm
# against the action's documentation.
cache: true

- name: Misconfiguration gate (trivy config)
# Scans rendered/ only. --exit-code 1 fails the step when a finding at one
# of the listed severities is reported; exceptions come from the ignore file.
run: |
trivy config --exit-code 1 \
--severity MEDIUM,HIGH,CRITICAL \
--ignorefile .trivyignore.yaml \
rendered

pr-summary:
name: PR Summary
runs-on: ubuntu-latest
Expand All @@ -93,7 +176,7 @@ jobs:
: r === 'skipped' ? ':fast_forward:' : ':x:';
const results = {
'YAML Lint': '${{ needs.lint.result }}',
'Render + assert (all environments)': '${{ needs.validate.result }}',
'Render + assert + schema + misconfig (all environments)': '${{ needs.validate.result }}',
};
const rows = Object.entries(results)
.map(([check, r]) => `| ${check} | ${icon(r)} ${r} |`)
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
*secret*.json
!*secret*-store*.yaml
!*secret*-store*.yml
# ExternalSecret manifests are references into the secret store, not secret
# material — they must ship with the charts that need them.
!*externalsecret*.yaml
!*external-secret*.yaml

# IAM roles / sensitive AWS config
*role-trust-policy*.json
Expand Down
92 changes: 92 additions & 0 deletions .trivyignore.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# Scoped, justified exceptions for the trivy misconfiguration gate in CI
# (validate job, `trivy config ... rendered`). Paths match rendered-manifest
# file names, so each exception applies to exactly one addon's output — a new
# finding anywhere else still fails the gate until it is fixed or added here
# with a statement.
# Each entry pairs one check id with the rendered-file globs it is waived for
# and a statement justifying the waiver.
misconfigurations:
# ── aws-neuron-device-plugin ──────────────────────────────────────────
# Kubernetes device plugin: root and hostPath are the device-plugin
# contract, not a misconfiguration.
- id: KSV-0012
paths:
- "*aws-neuron-device-plugin*"
statement: >-
The device plugin runs as root to open /dev/neuron* devices and
register over the kubelet's device-plugins socket.
- id: KSV-0023
paths:
- "*aws-neuron-device-plugin*"
statement: >-
hostPath mounts of the kubelet device-plugins directory (and the
Neuron driver paths) are how a device plugin talks to the kubelet;
there is no volume type that replaces them.
- id: KSV-0125
paths:
- "*aws-neuron-device-plugin*"
statement: >-
Registry-allowlist check; the image ships from AWS's public Neuron
registry. Image provenance is enforced cluster-side by the Kyverno
verify-images policy (policies/kyverno/supply-chain).

# ── druid catalog chart ───────────────────────────────────────────────
# Every entry carries two globs because the render is written under two
# names: CI writes rendered/catalog_druid_chart.yaml, while `task render`
# writes rendered/catalog-druid-chart.yaml.
- id: KSV-0014
paths:
- "*catalog_druid_chart*"
- "*catalog-druid-chart*"
statement: >-
The apache/druid entrypoint and the JVM write inside the container
filesystem at startup; scratch/task data already lives on the
druid-scratch emptyDir at /var/druid. readOnlyRootFilesystem is not
enabled until it has been verified against the upstream image.
# Password-named ConfigMap keys (KSV-0109, KSV-01010).
- id: KSV-0109
paths:
- "*catalog_druid_chart*"
- "*catalog-druid-chart*"
statement: >-
The runtime.properties ConfigMaps contain keys named *Password whose
values are ${env:...} references resolved in-pod from
ExternalSecret-backed env vars — no secret material is stored in the
ConfigMap.
- id: KSV-01010
paths:
- "*catalog_druid_chart*"
- "*catalog-druid-chart*"
statement: >-
Same as KSV-0109 — password-named keys hold environment-variable
indirection, not values.
# Findings against the chart's namespace-scoped Role (KSV-0042, KSV-0048,
# KSV-0049, KSV-0113).
- id: KSV-0042
paths:
- "*catalog_druid_chart*"
- "*catalog-druid-chart*"
statement: >-
The namespace-scoped Role (enumerated verbs, no wildcards) backs
druid-kubernetes-overlord-extensions: the overlord launches ingestion
tasks as Jobs and manages their pods and logs.
- id: KSV-0048
paths:
- "*catalog_druid_chart*"
- "*catalog-druid-chart*"
statement: >-
Same Role — Job/pod management is the Druid k8s task runner's job.
- id: KSV-0049
paths:
- "*catalog_druid_chart*"
- "*catalog-druid-chart*"
statement: >-
Same Role — druid-kubernetes-extensions uses ConfigMaps for node
announcements and leader election.
- id: KSV-0113
paths:
- "*catalog_druid_chart*"
- "*catalog-druid-chart*"
statement: >-
Same Role — task pods created by the overlord consume the
chart-managed secrets in their own namespace.
# Same check id as the device-plugin entry above, scoped to the druid render.
- id: KSV-0125
paths:
- "*catalog_druid_chart*"
- "*catalog-druid-chart*"
statement: >-
Registry-allowlist check; apache/druid is the upstream vendor image.
Image provenance is enforced cluster-side by the Kyverno verify-images
policy (policies/kyverno/supply-chain).
4 changes: 3 additions & 1 deletion CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ task lint:yaml # YAML lint all files
task kustomize:build # Build all overlays (all environments)
task kustomize:build:env # Build overlays for ENVIRONMENT (default: dev)
task validate # Lint + build combined
task render # Render manifests to rendered/ directory
task render # Render manifests to rendered/ (incl. druid chart)
task scan # kubeconform + trivy config gates over rendered/
```

## Relationship to Parent Repo
Expand All @@ -83,6 +84,7 @@ task render # Render manifests to rendered/ directory
## CI

- PR and push to main trigger `.github/workflows/ci.yml` (lint → validate per environment → PR summary)
- The validate job renders every kustomize root plus the druid catalog chart, then gates the rendered output: render-assert (no unfilled sentinels), kubeconform strict (native schemas + datreeio CRDs-catalog, no ignore-missing-schemas), and `trivy config` (misconfiguration scan, MEDIUM+ hard-fails; scoped justified exceptions live in `.trivyignore.yaml`)
- Manual diff rendering available via `.github/workflows/diff.yml`

## Claude Code Tooling
Expand Down
30 changes: 30 additions & 0 deletions Taskfile.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,21 @@ tasks:
cmds:
- echo "All validations passed."

# Local counterpart of the CI schema and misconfiguration gates: the same
# kubeconform and trivy flags over rendered/, without CI's -cache directory.
# It does not render anything itself.
scan:
desc: "Schema + misconfiguration gates on rendered/ (run task render first)"
cmds:
# The brace escapes in the CRD schema URL keep Task's own templating from
# consuming the placeholders that kubeconform fills in.
- |
kubeconform -strict -summary \
-schema-location default \
-schema-location 'https://raw.githubusercontent.com/datreeio/CRDs-catalog/main/{{"{{"}}.Group{{"}}"}}/{{"{{"}}.ResourceKind{{"}}"}}_{{"{{"}}.ResourceAPIVersion{{"}}"}}.json' \
-skip Grafana \
rendered
- |
trivy config --exit-code 1 \
--severity MEDIUM,HIGH,CRITICAL \
--ignorefile .trivyignore.yaml \
rendered

# === Rendering ===

render:
Expand All @@ -74,6 +89,21 @@ tasks:
echo "Rendering $dir ..."
kustomize build --enable-helm "$dir" > "rendered/${name}.yaml" || exit 1
done
# Druid catalog chart with synthetic tenant values — the same render that
# CI gates (see .github/workflows/ci.yml).
echo "Rendering catalog/druid/chart ..."
helm template ci catalog/druid/chart -n druid-ci \
-f catalog/druid/values.yaml \
--set hostedId=ci \
--set environment={{.ENVIRONMENT}} \
--set secrets.metadata=ci/druid/metadata \
--set secrets.admin=ci/druid/admin \
--set secrets.system=ci/druid/system \
--set secrets.keystore=ci/druid/keystore \
--set s3.indexLogsBucket=ci-druid-indexlogs \
--set s3.deepStorageBucket=ci-druid-deepstorage \
--set s3.msqBucket=ci-druid-msq \
> rendered/catalog-druid-chart.yaml
echo "All {{.ENVIRONMENT}} overlays rendered to rendered/"

clean:
Expand Down
51 changes: 46 additions & 5 deletions catalog/druid/chart/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,15 @@ Labels & Annotations
{{- end }}
{{- end -}}

{{/*
Selector labels for a single component.

Takes a two-element list: index 0 is the component key, index 1 is a context
that exposes .Values (domain) and is forwarded to druid.component.name.

Only the component label is emitted — the {domain}/name label already
reaches every resource through common.labels, and emitting it here too would
render the same map key twice wherever templates combine component.labels
with match.labels.
*/}}
{{- define "druid.component.match.labels" -}}
{{- $component := index . 0 -}}
{{- $ctx := index . 1 -}}
{{ $ctx.Values.domain }}/component: {{ include "druid.component.name" (list $component $ctx) }}
{{- end -}}

Expand Down Expand Up @@ -105,6 +110,22 @@ securityContext:
{{- toYaml $ctx.Values.securityContext | nindent 2 }}
{{- end -}}

{{/*
Container-level hardening, shared by every Druid container. The JVM runs as
uid 1000 (pod securityContext), never needs privilege escalation, holds no
capabilities (all listen ports are >1024), and is happy under the runtime's
default seccomp profile.

Renders a complete `securityContext:` key and reads nothing from the context
it is passed, so the output is identical for every caller. Include it at
container level, e.g.
  include "druid.container.security" . | nindent 10
*/}}
{{- define "druid.container.security" -}}
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
seccompProfile:
type: RuntimeDefault
{{- end -}}

{{/*
Environment — computed from release name
*/}}
Expand Down Expand Up @@ -177,6 +198,11 @@ Environment — computed from release name
secretKeyRef:
name: {{ include "druid.name" . }}-druid-system
key: password
- name: DRUID_TLS_KEYSTORE_PASSWORD
valueFrom:
secretKeyRef:
name: {{ include "druid.name" . }}-keystore-password
key: password
{{- with .Values.extraEnv }}
{{ . | toYaml }}
{{- end }}
Expand Down Expand Up @@ -274,7 +300,16 @@ JVM
{{- end -}}

{{/*
Probes
Probes — (port, healthPath, readinessPath).

Every process in this chart (coordinator, overlord, broker, historical, router)
exposes /status/health — plus the broker/historical readiness endpoints — on its
service port, so all probes are real httpGet checks; none of the components
lacks an HTTP health surface, so there is no tcpSocket fallback. The cluster
runs TLS-only (druid.enablePlaintextPort=false), hence scheme HTTPS: the
kubelet skips certificate verification for HTTPS probes, so the internal CA is
fine, and Druid serves these paths unauthenticated (they are on its default
unsecured-path list) with client certs requested but not required.
*/}}

{{- define "druid.probes" -}}
Expand All @@ -283,24 +318,30 @@ Probes
{{- $readinessPath := index . 2 -}}
livenessProbe:
failureThreshold: 3
tcpSocket:
httpGet:
path: {{ $healthPath }}
port: {{ $port }}
scheme: HTTPS
initialDelaySeconds: 180
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 5
readinessProbe:
failureThreshold: 10
tcpSocket:
httpGet:
path: {{ $readinessPath }}
port: {{ $port }}
scheme: HTTPS
initialDelaySeconds: 180
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 5
startupProbe:
failureThreshold: 60
tcpSocket:
httpGet:
path: {{ $healthPath }}
port: {{ $port }}
scheme: HTTPS
initialDelaySeconds: 60
periodSeconds: 10
successThreshold: 1
Expand Down
1 change: 1 addition & 0 deletions catalog/druid/chart/templates/broker/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ spec:
containers:
- name: {{ include "druid.component.name" (list "broker" .) }}
{{- include "druid.image" . | nindent 10 }}
{{- include "druid.container.security" . | nindent 10 }}
args: ["broker"]
{{- include "druid.ports" (list "broker" 8282) | nindent 10 }}
env:
Expand Down
Loading
Loading