Skip to content

Cilium Cluster Mesh upgrade (ci-clustermesh) #3236

Cilium Cluster Mesh upgrade (ci-clustermesh)

Cilium Cluster Mesh upgrade (ci-clustermesh) #3236

name: Cilium Cluster Mesh upgrade (ci-clustermesh)

# The concurrency group further down is keyed on the event types declared
# here: whenever a trigger is added, removed or changed, update it as well.
on:
  # Dispatched for one specific pull request; everything the run needs to know
  # about that PR arrives through the inputs below.
  workflow_dispatch:
    inputs:
      PR-number:
        description: "Pull request number."
        required: true
      context-ref:
        description: "Context in which the workflow runs. If PR is from a fork, will be the PR target branch (general case). If PR is NOT from a fork, will be the PR branch itself (this allows committers to test changes to workflows directly from PRs)."
        required: true
      SHA:
        description: "SHA under test (head of the PR branch)."
        required: true
      extra-args:
        description: "[JSON object] Arbitrary arguments passed from the trigger comment via regex capture group. Parse with 'fromJson(inputs.extra-args).argName' in workflow."
        required: false
        default: '{}'

  # Periodic run, every 6 hours (minute 0 of hours 3, 9, 15 and 21).
  schedule:
    - cron: '0 3/6 * * *'
# Granting any scope explicitly sets every scope that is not listed to 'none',
# so the entries below are the complete set of token permissions.
permissions:
  # Required by actions/checkout to read the repository
  contents: read
  # Required to retrieve information from the PR API
  pull-requests: read
  # Required to set commit statuses
  statuses: write
concurrency:
  # The group name is made of three parts, one per line:
  #   1. the workflow name
  #   2. the event type
  #   3. an identifier that depends on the event type:
  #        - schedule:          the SHA
  #        - workflow_dispatch: the PR number
  #
  # Each kind of testing therefore gets its own group name, and re-running
  # the same test cancels the run that is still in progress.
  group: |
    ${{ github.workflow }}
    ${{ github.event_name }}
    ${{
      (github.event_name == 'schedule' && github.sha) ||
      (github.event_name == 'workflow_dispatch' && github.event.inputs.PR-number)
    }}
  cancel-in-progress: true
# Workflow-wide settings: pinned tool versions (the "renovate:" markers are
# kept verbatim directly above the values they annotate) and the names of the
# two Kind clusters together with their kubeconfig contexts.
env:
  # renovate: datasource=github-releases depName=kubernetes-sigs/kind
  kind_version: v0.20.0
  # renovate: datasource=docker depName=kindest/node
  k8s_version: v1.27.3
  # renovate: datasource=github-releases depName=cilium/cilium-cli
  cilium_cli_version: v0.15.19
  # Empty by default; forwarded as the 'ci-version' input of the cilium-cli
  # action. Written as an explicit empty string because a bare `key:` is
  # parsed as YAML null rather than as a string.
  cilium_cli_ci_version: ''
  # Cluster names and the matching contexts used by kubectl and the Cilium CLI.
  clusterName1: cluster1
  clusterName2: cluster2
  contextName1: kind-cluster1
  contextName2: kind-cluster2
jobs:
  # Publishes the initial commit status on the SHA under test (the dispatched
  # SHA when provided, otherwise the SHA that triggered the run). No 'status'
  # input is passed, so the action's own default applies
  # (presumably 'pending' -- confirm against the action's README).
  commit-status-start:
    name: Commit Status Start
    runs-on: ubuntu-latest
    steps:
      - name: Set initial commit status
        uses: myrotvorets/set-commit-status-action@38f3f27c7d52fb381273e95542f07f0fba301307 # v2.0.0
        with:
          sha: ${{ inputs.SHA || github.sha }}
# Main test job: installs Cilium on two meshed Kind clusters, then upgrades
# and downgrades cluster1, running the connectivity test after each phase.
upgrade-and-downgrade:
  name: "Upgrade and Downgrade Test"
  runs-on: ${{ vars.GH_RUNNER_EXTRA_POWER }}
  timeout-minutes: 60
  env:
    # Prefix of the JUnit report file names written by the connectivity test
    # steps of this job. Kept identical to the job's `name` above so that the
    # reports are labelled with the job that actually produced them.
    job_name: "Upgrade and Downgrade Test"
  strategy:
    # Let every matrix entry run to completion even if another one fails.
    fail-fast: false
    matrix:
      include:
        - name: '1'
          encryption: 'disabled'
          kube-proxy: 'iptables'
  steps:
# Check out the trusted context: the dispatched context-ref when provided,
# otherwise the SHA that triggered the run. Credentials are not persisted in
# the working copy.
- name: Checkout context ref (trusted)
  uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
  with:
    ref: ${{ inputs.context-ref || github.sha }}
    persist-credentials: false

# Repository-local composite action taken from the checkout above; later steps
# read env.QUAY_ORGANIZATION_DEV (presumably exported here -- verify in the
# action's definition).
- name: Set Environment Variables
  uses: ./.github/actions/set-env-variables
# Computes the values shared by the rest of the job and publishes them as step
# outputs:
#   sha                         SHA under test
#   downgrade_version           printed by contrib/scripts/print-downgrade-version.sh
#   cilium_image_settings       chart directory and image overrides for the SHA under test
#   cilium_install_defaults     common install flags (plus encryption flags, if enabled)
#   connectivity_test_defaults  flags for every `cilium connectivity test` invocation
- name: Set up job variables
  id: vars
  env:
    # The dispatch input is handed to the script through the environment
    # instead of being expanded into the script text, so that its content is
    # never interpreted as shell code in this privileged workflow.
    INPUT_SHA: ${{ inputs.SHA }}
  run: |
    if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
      SHA="${INPUT_SHA}"
    else
      SHA="${{ github.sha }}"
    fi

    CILIUM_DOWNGRADE_VERSION=$(contrib/scripts/print-downgrade-version.sh)

    CILIUM_IMAGE_SETTINGS=" \
      --chart-directory=./untrusted/cilium-newest/install/kubernetes/cilium \
      --set=image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/cilium-ci:${SHA} \
      --set=operator.image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/operator-generic-ci:${SHA} \
      --set=clustermesh.apiserver.image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/clustermesh-apiserver-ci:${SHA} \
    "

    # * bpf.masquerade is disabled due to #23283
    # * Hubble is disabled to avoid the performance penalty in the testing
    #   environment due to the relatively high traffic load.
    CILIUM_INSTALL_DEFAULTS=" \
      --set=debug.enabled=true \
      --set=bpf.masquerade=false \
      --set=bpf.monitorAggregation=none \
      --set=hubble.enabled=false \
      --set=tunnel=vxlan \
      --set=ipv4.enabled=true \
      --set=ipv6.enabled=true \
      --set=clustermesh.useAPIServer=true \
      --set=clustermesh.config.enabled=true"

    # Run only a limited subset of tests to reduce the amount of time
    # required. The full suite is run in conformance-clustermesh.
    CONNECTIVITY_TEST_DEFAULTS=" \
      --hubble=false \
      --flow-validation=disabled \
      --test='no-policies/' \
      --test='no-policies-extra/' \
      --test='allow-all-except-world/' \
      --test='client-ingress/' \
      --test='client-egress/' \
      --test='cluster-entity-multicluster/' \
      --test='!/pod-to-world' \
      --test='!/pod-to-cidr' \
      --collect-sysdump-on-failure"

    # Encryption flags are only added when the matrix entry asks for them.
    CILIUM_INSTALL_ENCRYPTION=""
    if [ "${{ matrix.encryption }}" != "disabled" ]; then
      CILIUM_INSTALL_ENCRYPTION=" \
        --set=encryption.enabled=true \
        --set=encryption.type=${{ matrix.encryption }}"
    fi

    # Every value above is a single line (the backslash-newlines inside the
    # double quotes are line continuations), so plain name=value output lines
    # are sufficient. The output file path is quoted to avoid word splitting.
    {
      echo "sha=${SHA}"
      echo "downgrade_version=${CILIUM_DOWNGRADE_VERSION}"
      echo "cilium_image_settings=${CILIUM_IMAGE_SETTINGS}"
      echo "cilium_install_defaults=${CILIUM_INSTALL_DEFAULTS} ${CILIUM_INSTALL_ENCRYPTION}"
      echo "connectivity_test_defaults=${CONNECTIVITY_TEST_DEFAULTS}"
    } >> "$GITHUB_OUTPUT"
# Install the Cilium CLI through its own action; both version selectors come
# from the workflow-level env block.
- name: Install Cilium CLI
  uses: cilium/cilium-cli@32109b32259a487c803648a3c9463cdcafa4c0c3 # v0.15.19
  with:
    release-version: ${{ env.cilium_cli_version }}
    ci-version: ${{ env.cilium_cli_ci_version }}
# Render one Kind configuration per cluster from the shared template. Both
# clusters are dual-stack and use the matrix kube-proxy mode; they differ only
# in their pod and service CIDRs.
- name: Generate Kind configuration files
  run: |
    # cluster1
    K8S_VERSION=${{ env.k8s_version }} \
      PODCIDR=10.242.0.0/16,fd00:10:242::/48 \
      SVCCIDR=10.243.0.0/16,fd00:10:243::/112 \
      IPFAMILY=dual \
      KUBEPROXYMODE=${{ matrix.kube-proxy }} \
      envsubst < ./.github/kind-config.yaml.tmpl > ./.github/kind-config-cluster1.yaml

    # cluster2
    K8S_VERSION=${{ env.k8s_version }} \
      PODCIDR=10.244.0.0/16,fd00:10:244::/48 \
      SVCCIDR=10.245.0.0/16,fd00:10:245::/112 \
      IPFAMILY=dual \
      KUBEPROXYMODE=${{ matrix.kube-proxy }} \
      envsubst < ./.github/kind-config.yaml.tmpl > ./.github/kind-config-cluster2.yaml
# Create the two Kind clusters from the configuration files generated above.
- name: Create Kind cluster 1
  uses: helm/kind-action@dda0770415bac9fc20092cacbc54aa298604d140 # v1.8.0
  with:
    cluster_name: ${{ env.clusterName1 }}
    version: ${{ env.kind_version }}
    kubectl_version: ${{ env.k8s_version }}
    config: ./.github/kind-config-cluster1.yaml
    # The control-plane never becomes ready, since no CNI is present
    wait: 0

- name: Create Kind cluster 2
  uses: helm/kind-action@dda0770415bac9fc20092cacbc54aa298604d140 # v1.8.0
  with:
    cluster_name: ${{ env.clusterName2 }}
    version: ${{ env.kind_version }}
    kubectl_version: ${{ env.k8s_version }}
    config: ./.github/kind-config-cluster2.yaml
    # The control-plane never becomes ready, since no CNI is present
    wait: 0
# Only for the 'ipsec' matrix entries: generate one random key and store the
# very same 'cilium-ipsec-keys' secret in kube-system of both clusters.
- name: Create the IPSec secret in both clusters
  if: matrix.encryption == 'ipsec'
  run: |
    SECRET="3 rfc4106(gcm(aes)) $(openssl rand -hex 20) 128"

    kubectl --context ${{ env.contextName1 }} create -n kube-system secret generic cilium-ipsec-keys --from-literal=keys="${SECRET}"
    kubectl --context ${{ env.contextName2 }} create -n kube-system secret generic cilium-ipsec-keys --from-literal=keys="${SECRET}"
# Pre-computes the flags that mesh the two clusters and publishes them as the
# step outputs 'cilium_install_cluster1' and 'cilium_install_cluster2'; each
# output also carries the TLS flags built from the certificates generated here.
- name: Set clustermesh connection parameters
  id: clustermesh-vars
  run: |
    # Let's retrieve in advance the parameters to mesh the two clusters, so
    # that we don't need to do that through the CLI in a second step, as it
    # would be reset during upgrade (as we are resetting the values).
    # (the address is the sixth column of `kubectl get nodes -o wide`)
    IP1=$(kubectl --context ${{ env.contextName1 }} get nodes \
      ${{ env.clusterName1 }}-worker -o wide --no-headers | awk '{ print $6 }')
    IP2=$(kubectl --context ${{ env.contextName2 }} get nodes \
      ${{ env.clusterName2 }}-worker -o wide --no-headers | awk '{ print $6 }')

    # Explicitly configure the NodePorts to make sure that they are different
    # in each cluster, to workaround #24692
    PORT1=32379
    PORT2=32380

    # Generate the TLS certificates and explicitly configure them:
    # a self-signed CA, plus a "remote" certificate issued by that CA.
    openssl genrsa 4096 > cilium-ca-key.pem
    openssl genrsa 4096 > cm-remote-key.pem
    openssl req -new -x509 -nodes -days 1 -key cilium-ca-key.pem -out cilium-ca-crt.pem -subj "/CN=Cilium CA/"
    openssl req -new -x509 -nodes -days 1 -subj "/CN=remote/" \
      -key cm-remote-key.pem -out cm-remote-crt.pem \
      -CA cilium-ca-crt.pem -CAkey cilium-ca-key.pem

    CILIUM_INSTALL_TLS=" \
      --set tls.ca.cert=$(base64 -w0 cilium-ca-crt.pem) \
      --set tls.ca.key=$(base64 -w0 cilium-ca-key.pem) \
      --set clustermesh.apiserver.tls.ca.cert=$(base64 -w0 cilium-ca-crt.pem) \
      --set clustermesh.apiserver.tls.ca.key=$(base64 -w0 cilium-ca-key.pem) \
      --set clustermesh.config.clusters[0].tls.cert=$(base64 -w0 cm-remote-crt.pem) \
      --set clustermesh.config.clusters[0].tls.key=$(base64 -w0 cm-remote-key.pem) \
    "

    # Each cluster exposes its own NodePort and points at the other cluster's
    # worker node IP and NodePort.
    CILIUM_INSTALL_CLUSTER1=" \
      --set cluster.name=${{ env.clusterName1 }} \
      --set cluster.id=1 \
      --set clustermesh.apiserver.service.nodePort=$PORT1 \
      --set clustermesh.config.clusters[0].name=${{ env.clusterName2 }} \
      --set clustermesh.config.clusters[0].ips={$IP2} \
      --set clustermesh.config.clusters[0].port=$PORT2"

    CILIUM_INSTALL_CLUSTER2=" \
      --set cluster.name=${{ env.clusterName2 }} \
      --set cluster.id=255 \
      --set clustermesh.apiserver.service.nodePort=$PORT2 \
      --set clustermesh.config.clusters[0].name=${{ env.clusterName1 }} \
      --set clustermesh.config.clusters[0].ips={$IP1} \
      --set clustermesh.config.clusters[0].port=$PORT1"

    echo "cilium_install_cluster1=$CILIUM_INSTALL_CLUSTER1 $CILIUM_INSTALL_TLS" >> $GITHUB_OUTPUT
    echo "cilium_install_cluster2=$CILIUM_INSTALL_CLUSTER2 $CILIUM_INSTALL_TLS" >> $GITHUB_OUTPUT
# Warning: this is a privileged workflow, so none of the job steps that follow
# may execute untrusted code. Both checkouts below are sparse (Helm chart
# directory only), land under ./untrusted/ and do not persist credentials.
- name: Checkout pull request branch (NOT TRUSTED)
  uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
  with:
    ref: ${{ steps.vars.outputs.sha }}
    path: untrusted/cilium-newest
    persist-credentials: false
    sparse-checkout: |
      install/kubernetes/cilium

- name: Checkout ${{ steps.vars.outputs.downgrade_version }} branch
  uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
  with:
    ref: ${{ steps.vars.outputs.downgrade_version }}
    path: untrusted/cilium-downgrade
    persist-credentials: false
    sparse-checkout: |
      install/kubernetes/cilium
# Resolves the commit checked out for the downgrade version and derives the
# chart directory and image overrides from it. Outputs: 'sha' and
# 'cilium_image_settings'.
- name: Set up downgrade settings
  id: downgrade-vars
  run: |
    # HEAD of the downgrade checkout identifies the image tags to use.
    SHA="$(cd untrusted/cilium-downgrade && git rev-parse HEAD)"

    CILIUM_IMAGE_SETTINGS=" \
      --chart-directory=./untrusted/cilium-downgrade/install/kubernetes/cilium \
      --set=image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/cilium-ci:${SHA} \
      --set=operator.image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/operator-generic-ci:${SHA} \
      --set=clustermesh.apiserver.image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/clustermesh-apiserver-ci:${SHA} \
    "

    echo "sha=${SHA}" >> $GITHUB_OUTPUT
    echo "cilium_image_settings=${CILIUM_IMAGE_SETTINGS}" >> $GITHUB_OUTPUT
# Poll the registry every 45 seconds until each image exists for the given
# tag. The loops themselves have no exit condition besides success; the
# 10-minute step timeout is what bounds the wait.
- name: Wait for images to be available (newest)
  timeout-minutes: 10
  shell: bash
  run: |
    for image in cilium-ci operator-generic-ci hubble-relay-ci clustermesh-apiserver-ci ; do
      until docker manifest inspect quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/$image:${{ steps.vars.outputs.sha }} &> /dev/null; do
        sleep 45s
      done
    done

- name: Wait for images to be available (downgrade)
  timeout-minutes: 10
  shell: bash
  run: |
    for image in cilium-ci operator-generic-ci hubble-relay-ci clustermesh-apiserver-ci ; do
      until docker manifest inspect quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/$image:${{ steps.downgrade-vars.outputs.sha }} &> /dev/null; do
        sleep 45s
      done
    done
# Initial installation: cluster1 is installed with the downgrade-version chart
# and images (downgrade-vars), cluster2 with those of the SHA under test
# (vars). The id of the cluster1 step is referenced by the post-test
# information gathering step.
- name: Install Cilium in cluster1
  id: install-cilium-cluster1
  run: |
    cilium --context ${{ env.contextName1 }} install \
      ${{ steps.downgrade-vars.outputs.cilium_image_settings }} \
      ${{ steps.vars.outputs.cilium_install_defaults }} \
      ${{ steps.clustermesh-vars.outputs.cilium_install_cluster1 }}

- name: Install Cilium in cluster2
  run: |
    cilium --context ${{ env.contextName2 }} install \
      ${{ steps.vars.outputs.cilium_image_settings }} \
      ${{ steps.vars.outputs.cilium_install_defaults }} \
      ${{ steps.clustermesh-vars.outputs.cilium_install_cluster2 }}

# Block until Cilium itself and then the cluster mesh report ready on both
# sides (the mesh checks wait for up to 5 minutes each).
- name: Wait for cluster mesh status to be ready
  run: |
    cilium --context ${{ env.contextName1 }} status --wait
    cilium --context ${{ env.contextName2 }} status --wait
    cilium --context ${{ env.contextName1 }} clustermesh status --wait --wait-duration=5m
    cilium --context ${{ env.contextName2 }} clustermesh status --wait --wait-duration=5m
# Directory that receives the JUnit reports of all connectivity test phases.
- name: Make JUnit report directory
  run: |
    mkdir -p cilium-junits

# Baseline multi-cluster connectivity test, run before cluster1 is upgraded.
- name: Run connectivity test - pre-upgrade (${{ join(matrix.*, ', ') }})
  run: |
    cilium --context ${{ env.contextName1 }} connectivity test \
      --multi-cluster=${{ env.contextName2 }} \
      ${{ steps.vars.outputs.connectivity_test_defaults }} \
      --junit-file "cilium-junits/${{ env.job_name }} - pre-upgrade (${{ join(matrix.*, ', ') }}).xml" \
      --junit-property github_job_step="Run tests pre-upgrade (${{ join(matrix.*, ', ') }})"
# Upgrade phase: move cluster1 to the chart and images of the SHA under test.
# Values are reset, hence the complete flag set is passed again.
- name: Upgrade Cilium in cluster1
  run: |
    cilium --context ${{ env.contextName1 }} upgrade --reset-values \
      ${{ steps.vars.outputs.cilium_image_settings }} \
      ${{ steps.vars.outputs.cilium_install_defaults }} \
      ${{ steps.clustermesh-vars.outputs.cilium_install_cluster1 }}

- name: Rollout Cilium agents in cluster2
  run: |
    # This makes sure that the remote agents reconnect to the new instance of the
    # clustermesh-apiserver, without waiting for the watchdog mechanism to kick in.
    kubectl --context ${{ env.contextName2 }} rollout restart -n kube-system ds/cilium

# Same readiness gate as after the initial installation.
- name: Wait for cluster mesh status to be ready
  run: |
    cilium --context ${{ env.contextName1 }} status --wait
    cilium --context ${{ env.contextName2 }} status --wait
    cilium --context ${{ env.contextName1 }} clustermesh status --wait --wait-duration=5m
    cilium --context ${{ env.contextName2 }} clustermesh status --wait --wait-duration=5m

- name: Run connectivity test - post-upgrade (${{ join(matrix.*, ', ') }})
  run: |
    cilium --context ${{ env.contextName1 }} connectivity test \
      --multi-cluster=${{ env.contextName2 }} \
      ${{ steps.vars.outputs.connectivity_test_defaults }} \
      --junit-file "cilium-junits/${{ env.job_name }} - post upgrade (${{ join(matrix.*, ', ') }}).xml" \
      --junit-property github_job_step="Run tests post-upgrade (${{ join(matrix.*, ', ') }})"
# Downgrade phase: bring cluster1 back to the downgrade-version chart and
# images, again resetting values and passing the complete flag set.
- name: Downgrade Cilium in cluster1
  run: |
    cilium --context ${{ env.contextName1 }} upgrade --reset-values \
      ${{ steps.downgrade-vars.outputs.cilium_image_settings }} \
      ${{ steps.vars.outputs.cilium_install_defaults }} \
      ${{ steps.clustermesh-vars.outputs.cilium_install_cluster1 }}

- name: Rollout Cilium agents in cluster2
  run: |
    # This makes sure that the remote agents reconnect to the new instance of the
    # clustermesh-apiserver, without waiting for the watchdog mechanism to kick in.
    kubectl --context ${{ env.contextName2 }} rollout restart -n kube-system ds/cilium

# Same readiness gate as after the initial installation.
- name: Wait for cluster mesh status to be ready
  run: |
    cilium --context ${{ env.contextName1 }} status --wait
    cilium --context ${{ env.contextName2 }} status --wait
    cilium --context ${{ env.contextName1 }} clustermesh status --wait --wait-duration=5m
    cilium --context ${{ env.contextName2 }} clustermesh status --wait --wait-duration=5m

- name: Run connectivity test - post-downgrade (${{ join(matrix.*, ', ') }})
  run: |
    cilium --context ${{ env.contextName1 }} connectivity test \
      --multi-cluster=${{ env.contextName2 }} \
      ${{ steps.vars.outputs.connectivity_test_defaults }} \
      --junit-file "cilium-junits/${{ env.job_name }} - post downgrade (${{ join(matrix.*, ', ') }}).xml" \
      --junit-property github_job_step="Run tests post-downgrade (${{ join(matrix.*, ', ') }})"
# Diagnostics for runs that did not succeed, provided the cluster1 install
# step was not skipped: status of both clusters, pod listings and one sysdump
# per cluster.
- name: Post-test information gathering
  if: ${{ !success() && steps.install-cilium-cluster1.outcome != 'skipped' }}
  shell: bash {0} # Disable default fail-fast behaviour so that all commands run independently
  run: |
    cilium --context ${{ env.contextName1 }} status
    cilium --context ${{ env.contextName1 }} clustermesh status
    cilium --context ${{ env.contextName2 }} status
    cilium --context ${{ env.contextName2 }} clustermesh status

    kubectl config use-context ${{ env.contextName1 }}
    kubectl get pods --all-namespaces -o wide
    cilium sysdump --output-filename cilium-sysdump-context1-final-${{ join(matrix.*, '-') }}

    kubectl config use-context ${{ env.contextName2 }}
    kubectl get pods --all-namespaces -o wide
    cilium sysdump --output-filename cilium-sysdump-context2-final-${{ join(matrix.*, '-') }}
# Sysdump archives are uploaded only when the job did not succeed.
- name: Upload artifacts
  if: ${{ !success() }}
  uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
  with:
    name: cilium-sysdumps-${{ matrix.name }}
    path: cilium-sysdump-*.zip

# JUnit reports are uploaded and summarized regardless of the job outcome.
- name: Upload JUnits [junit]
  if: ${{ always() }}
  uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
  with:
    name: cilium-junits-${{ matrix.name }}
    path: cilium-junits/*.xml

- name: Publish Test Results As GitHub Summary
  if: ${{ always() }}
  uses: aanm/junit2md@332ebf0fddd34e91b03a832cfafaa826306558f9 # v0.0.3
  with:
    junit-directory: "cilium-junits"
# Runs after the test job whatever its result, and merges the per-matrix-entry
# artifacts ("cilium-sysdumps-*", "cilium-junits-*") into one artifact each,
# deleting the merged originals.
merge-upload:
  name: Merge and Upload Artifacts
  if: ${{ always() }}
  needs: upgrade-and-downgrade
  runs-on: ubuntu-latest
  steps:
    # Sysdumps are merged only when the test job failed; a failure of the
    # merge itself does not fail this job.
    - name: Merge Sysdumps
      if: ${{ needs.upgrade-and-downgrade.result == 'failure' }}
      continue-on-error: true
      uses: actions/upload-artifact/merge@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
      with:
        name: cilium-sysdumps
        pattern: cilium-sysdumps-*
        retention-days: 5
        delete-merged: true

    - name: Merge JUnits
      uses: actions/upload-artifact/merge@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
      with:
        name: cilium-junits
        pattern: cilium-junits-*
        retention-days: 5
        delete-merged: true
# Runs after the test job whatever its result, and reports that result as the
# commit status of the SHA under test.
# NOTE(review): `needs.<job>.result` can also be 'cancelled' or 'skipped';
# confirm that the status action accepts those values.
commit-status-final:
  name: Commit Status Final
  if: ${{ always() }}
  runs-on: ubuntu-latest
  needs: upgrade-and-downgrade
  steps:
    - name: Set final commit status
      uses: myrotvorets/set-commit-status-action@38f3f27c7d52fb381273e95542f07f0fba301307 # v2.0.0
      with:
        sha: ${{ inputs.SHA || github.sha }}
        status: ${{ needs.upgrade-and-downgrade.result }}