Cilium Cluster Mesh upgrade (ci-clustermesh) #3236
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Cilium Cluster Mesh upgrade (ci-clustermesh)

# Keep the concurrency group in sync with any change made to these triggers.
on:
  # Manual / bot-driven runs against a specific pull request.
  workflow_dispatch:
    inputs:
      PR-number:
        description: "Pull request number."
        required: true
      context-ref:
        description: >-
          Context in which the workflow runs. If PR is from a fork, will be the
          PR target branch (general case). If PR is NOT from a fork, will be the
          PR branch itself (this allows committers to test changes to workflows
          directly from PRs).
        required: true
      SHA:
        description: "SHA under test (head of the PR branch)."
        required: true
      extra-args:
        description: >-
          [JSON object] Arbitrary arguments passed from the trigger comment via
          regex capture group. Parse with 'fromJson(inputs.extra-args).argName'
          in workflow.
        required: false
        default: '{}'
  # Periodic run every 6 hours (hours 3, 9, 15 and 21).
  schedule:
    - cron: '0 3/6 * * *'
# Granting access to any one scope implicitly sets every scope that is not
# listed here to 'none'.
permissions:
  # Lets actions/checkout read the repository
  contents: read
  # Lets the workflow retrieve information from the PR API
  pull-requests: read
  # Lets the workflow set commit statuses
  statuses: write
concurrency:
  # The group name is composed of:
  # - the workflow name
  # - the event type
  # - an identifier that depends on the event type:
  #   - schedule: SHA
  #   - workflow_dispatch: PR number
  #
  # Each type of testing therefore gets its own group, and a re-run cancels
  # the previous run that belongs to the same group.
  group: |
    ${{ github.workflow }}
    ${{ github.event_name }}
    ${{
      (github.event_name == 'schedule' && github.sha) ||
      (github.event_name == 'workflow_dispatch' && github.event.inputs.PR-number)
    }}
  cancel-in-progress: true
env:
  # renovate: datasource=github-releases depName=kubernetes-sigs/kind
  kind_version: v0.20.0
  # renovate: datasource=docker depName=kindest/node
  k8s_version: v1.27.3
  # renovate: datasource=github-releases depName=cilium/cilium-cli
  cilium_cli_version: v0.15.19
  # Intentionally left without a value; forwarded as 'ci-version' to the
  # Cilium CLI install step.
  cilium_cli_ci_version:
  # Names of the two Kind clusters, and the contexts used to address them
  # through '--context'.
  clusterName1: cluster1
  clusterName2: cluster2
  contextName1: kind-cluster1
  contextName2: kind-cluster2
jobs:
  # Sets a commit status on the SHA under test when the workflow starts.
  commit-status-start:
    name: Commit Status Start
    runs-on: ubuntu-latest
    steps:
      - name: Set initial commit status
        uses: myrotvorets/set-commit-status-action@38f3f27c7d52fb381273e95542f07f0fba301307 # v2.0.0
        with:
          sha: ${{ inputs.SHA || github.sha }}

  # Creates two meshed Kind clusters, then upgrades and downgrades Cilium in
  # cluster1, running a connectivity test after each transition.
  upgrade-and-downgrade:
    name: "Upgrade and Downgrade Test"
    runs-on: ${{ vars.GH_RUNNER_EXTRA_POWER }}
    timeout-minutes: 60
    env:
      # Prefix of the JUnit report file names; kept equal to this job's name.
      job_name: "Upgrade and Downgrade Test"
    strategy:
      fail-fast: false
      matrix:
        include:
          - name: '1'
            encryption: 'disabled'
            kube-proxy: 'iptables'

    steps:
      - name: Checkout context ref (trusted)
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
        with:
          ref: ${{ inputs.context-ref || github.sha }}
          persist-credentials: false

      # Presumably exports QUAY_ORGANIZATION_DEV, which later steps read
      # through 'env' — verify against the action.
      - name: Set Environment Variables
        uses: ./.github/actions/set-env-variables

      # Computes the SHA under test and the flag sets shared by the install,
      # upgrade and connectivity test steps, and publishes them as outputs.
      - name: Set up job variables
        id: vars
        env:
          # Handed over through the environment instead of being expanded
          # inside the script text, so the dispatch input is never parsed as
          # shell code by this step.
          INPUT_SHA: ${{ inputs.SHA }}
        run: |
          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
            SHA="${INPUT_SHA}"
          else
            SHA="${{ github.sha }}"
          fi

          CILIUM_DOWNGRADE_VERSION=$(contrib/scripts/print-downgrade-version.sh)

          # Chart and image overrides for the version under test.
          CILIUM_IMAGE_SETTINGS=" \
            --chart-directory=./untrusted/cilium-newest/install/kubernetes/cilium \
            --set=image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/cilium-ci:${SHA} \
            --set=operator.image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/operator-generic-ci:${SHA} \
            --set=clustermesh.apiserver.image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/clustermesh-apiserver-ci:${SHA} \
          "

          # * bpf.masquerade is disabled due to #23283
          # * Hubble is disabled to avoid the performance penalty in the testing
          #   environment due to the relatively high traffic load.
          CILIUM_INSTALL_DEFAULTS=" \
            --set=debug.enabled=true \
            --set=bpf.masquerade=false \
            --set=bpf.monitorAggregation=none \
            --set=hubble.enabled=false \
            --set=tunnel=vxlan \
            --set=ipv4.enabled=true \
            --set=ipv6.enabled=true \
            --set=clustermesh.useAPIServer=true \
            --set=clustermesh.config.enabled=true"

          # Run only a limited subset of tests to reduce the amount of time
          # required. The full suite is run in conformance-clustermesh.
          CONNECTIVITY_TEST_DEFAULTS=" \
            --hubble=false \
            --flow-validation=disabled \
            --test='no-policies/' \
            --test='no-policies-extra/' \
            --test='allow-all-except-world/' \
            --test='client-ingress/' \
            --test='client-egress/' \
            --test='cluster-entity-multicluster/' \
            --test='!/pod-to-world' \
            --test='!/pod-to-cidr' \
            --collect-sysdump-on-failure"

          # Encryption flags are only added when the matrix entry enables it.
          CILIUM_INSTALL_ENCRYPTION=""
          if [ "${{ matrix.encryption }}" != "disabled" ]; then
            CILIUM_INSTALL_ENCRYPTION=" \
              --set=encryption.enabled=true \
              --set=encryption.type=${{ matrix.encryption }}"
          fi

          echo "sha=${SHA}" >> "$GITHUB_OUTPUT"
          echo "downgrade_version=${CILIUM_DOWNGRADE_VERSION}" >> "$GITHUB_OUTPUT"
          echo "cilium_image_settings=${CILIUM_IMAGE_SETTINGS}" >> "$GITHUB_OUTPUT"
          echo "cilium_install_defaults=${CILIUM_INSTALL_DEFAULTS} ${CILIUM_INSTALL_ENCRYPTION}" >> "$GITHUB_OUTPUT"
          echo "connectivity_test_defaults=${CONNECTIVITY_TEST_DEFAULTS}" >> "$GITHUB_OUTPUT"

      - name: Install Cilium CLI
        uses: cilium/cilium-cli@32109b32259a487c803648a3c9463cdcafa4c0c3 # v0.15.19
        with:
          release-version: ${{ env.cilium_cli_version }}
          ci-version: ${{ env.cilium_cli_ci_version }}

      # Renders one Kind configuration per cluster from the shared template,
      # giving each cluster its own dual-stack pod and service CIDRs.
      - name: Generate Kind configuration files
        run: |
          K8S_VERSION=${{ env.k8s_version }} \
            PODCIDR=10.242.0.0/16,fd00:10:242::/48 \
            SVCCIDR=10.243.0.0/16,fd00:10:243::/112 \
            IPFAMILY=dual \
            KUBEPROXYMODE=${{ matrix.kube-proxy }} \
            envsubst < ./.github/kind-config.yaml.tmpl > ./.github/kind-config-cluster1.yaml

          K8S_VERSION=${{ env.k8s_version }} \
            PODCIDR=10.244.0.0/16,fd00:10:244::/48 \
            SVCCIDR=10.245.0.0/16,fd00:10:245::/112 \
            IPFAMILY=dual \
            KUBEPROXYMODE=${{ matrix.kube-proxy }} \
            envsubst < ./.github/kind-config.yaml.tmpl > ./.github/kind-config-cluster2.yaml

      - name: Create Kind cluster 1
        uses: helm/kind-action@dda0770415bac9fc20092cacbc54aa298604d140 # v1.8.0
        with:
          cluster_name: ${{ env.clusterName1 }}
          version: ${{ env.kind_version }}
          kubectl_version: ${{ env.k8s_version }}
          config: ./.github/kind-config-cluster1.yaml
          wait: 0 # The control-plane never becomes ready, since no CNI is present

      - name: Create Kind cluster 2
        uses: helm/kind-action@dda0770415bac9fc20092cacbc54aa298604d140 # v1.8.0
        with:
          cluster_name: ${{ env.clusterName2 }}
          version: ${{ env.kind_version }}
          kubectl_version: ${{ env.k8s_version }}
          config: ./.github/kind-config-cluster2.yaml
          wait: 0 # The control-plane never becomes ready, since no CNI is present

      # The same randomly generated key is installed in both clusters.
      - name: Create the IPSec secret in both clusters
        if: matrix.encryption == 'ipsec'
        run: |
          SECRET="3 rfc4106(gcm(aes)) $(openssl rand -hex 20) 128"
          kubectl --context ${{ env.contextName1 }} create -n kube-system secret generic cilium-ipsec-keys --from-literal=keys="${SECRET}"
          kubectl --context ${{ env.contextName2 }} create -n kube-system secret generic cilium-ipsec-keys --from-literal=keys="${SECRET}"

      - name: Set clustermesh connection parameters
        id: clustermesh-vars
        run: |
          # Let's retrieve in advance the parameters to mesh the two clusters, so
          # that we don't need to do that through the CLI in a second step, as it
          # would be reset during upgrade (as we are resetting the values).
          IP1=$(kubectl --context ${{ env.contextName1 }} get nodes \
            ${{ env.clusterName1 }}-worker -o wide --no-headers | awk '{ print $6 }')
          IP2=$(kubectl --context ${{ env.contextName2 }} get nodes \
            ${{ env.clusterName2 }}-worker -o wide --no-headers | awk '{ print $6 }')

          # Explicitly configure the NodePorts to make sure that they are different
          # in each cluster, to workaround #24692
          PORT1=32379
          PORT2=32380

          # Generate the TLS certificates and explicitly configure them: a CA
          # key/certificate pair, plus a "remote" certificate issued with that
          # CA. Both certificates are valid for one day.
          openssl genrsa 4096 > cilium-ca-key.pem
          openssl genrsa 4096 > cm-remote-key.pem
          openssl req -new -x509 -nodes -days 1 -key cilium-ca-key.pem -out cilium-ca-crt.pem -subj "/CN=Cilium CA/"
          openssl req -new -x509 -nodes -days 1 -subj "/CN=remote/" \
            -key cm-remote-key.pem -out cm-remote-crt.pem \
            -CA cilium-ca-crt.pem -CAkey cilium-ca-key.pem

          CILIUM_INSTALL_TLS=" \
            --set tls.ca.cert=$(base64 -w0 cilium-ca-crt.pem) \
            --set tls.ca.key=$(base64 -w0 cilium-ca-key.pem) \
            --set clustermesh.apiserver.tls.ca.cert=$(base64 -w0 cilium-ca-crt.pem) \
            --set clustermesh.apiserver.tls.ca.key=$(base64 -w0 cilium-ca-key.pem) \
            --set clustermesh.config.clusters[0].tls.cert=$(base64 -w0 cm-remote-crt.pem) \
            --set clustermesh.config.clusters[0].tls.key=$(base64 -w0 cm-remote-key.pem) \
          "

          # Each cluster is pointed at the other one's node IP and NodePort.
          CILIUM_INSTALL_CLUSTER1=" \
            --set cluster.name=${{ env.clusterName1 }} \
            --set cluster.id=1 \
            --set clustermesh.apiserver.service.nodePort=$PORT1 \
            --set clustermesh.config.clusters[0].name=${{ env.clusterName2 }} \
            --set clustermesh.config.clusters[0].ips={$IP2} \
            --set clustermesh.config.clusters[0].port=$PORT2"

          CILIUM_INSTALL_CLUSTER2=" \
            --set cluster.name=${{ env.clusterName2 }} \
            --set cluster.id=255 \
            --set clustermesh.apiserver.service.nodePort=$PORT2 \
            --set clustermesh.config.clusters[0].name=${{ env.clusterName1 }} \
            --set clustermesh.config.clusters[0].ips={$IP1} \
            --set clustermesh.config.clusters[0].port=$PORT1"

          echo cilium_install_cluster1="$CILIUM_INSTALL_CLUSTER1 $CILIUM_INSTALL_TLS" >> "$GITHUB_OUTPUT"
          echo cilium_install_cluster2="$CILIUM_INSTALL_CLUSTER2 $CILIUM_INSTALL_TLS" >> "$GITHUB_OUTPUT"

      # Warning: since this is a privileged workflow, subsequent workflow job
      # steps must take care not to execute untrusted code.
      - name: Checkout pull request branch (NOT TRUSTED)
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
        with:
          ref: ${{ steps.vars.outputs.sha }}
          persist-credentials: false
          path: untrusted/cilium-newest
          sparse-checkout: |
            install/kubernetes/cilium

      - name: Checkout ${{ steps.vars.outputs.downgrade_version }} branch
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
        with:
          ref: ${{ steps.vars.outputs.downgrade_version }}
          persist-credentials: false
          path: untrusted/cilium-downgrade
          sparse-checkout: |
            install/kubernetes/cilium

      # Same image/chart overrides as in the 'vars' step, but pinned to the
      # HEAD commit of the downgrade checkout.
      - name: Set up downgrade settings
        id: downgrade-vars
        run: |
          SHA="$(cd untrusted/cilium-downgrade && git rev-parse HEAD)"

          CILIUM_IMAGE_SETTINGS=" \
            --chart-directory=./untrusted/cilium-downgrade/install/kubernetes/cilium \
            --set=image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/cilium-ci:${SHA} \
            --set=operator.image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/operator-generic-ci:${SHA} \
            --set=clustermesh.apiserver.image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/clustermesh-apiserver-ci:${SHA} \
          "

          echo "sha=${SHA}" >> "$GITHUB_OUTPUT"
          echo "cilium_image_settings=${CILIUM_IMAGE_SETTINGS}" >> "$GITHUB_OUTPUT"

      # Polls the registry every 45 seconds until each image manifest can be
      # inspected; bounded by the step timeout.
      - name: Wait for images to be available (newest)
        timeout-minutes: 10
        shell: bash
        run: |
          for image in cilium-ci operator-generic-ci hubble-relay-ci clustermesh-apiserver-ci ; do
            until docker manifest inspect quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/$image:${{ steps.vars.outputs.sha }} &> /dev/null; do sleep 45s; done
          done

      - name: Wait for images to be available (downgrade)
        timeout-minutes: 10
        shell: bash
        run: |
          for image in cilium-ci operator-generic-ci hubble-relay-ci clustermesh-apiserver-ci ; do
            until docker manifest inspect quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/$image:${{ steps.downgrade-vars.outputs.sha }} &> /dev/null; do sleep 45s; done
          done

      # cluster1 starts on the downgrade version (it is upgraded and then
      # downgraded later on); cluster2 is installed with the version under test.
      - name: Install Cilium in cluster1
        id: install-cilium-cluster1
        run: |
          cilium --context ${{ env.contextName1 }} install \
            ${{ steps.downgrade-vars.outputs.cilium_image_settings }} \
            ${{ steps.vars.outputs.cilium_install_defaults }} \
            ${{ steps.clustermesh-vars.outputs.cilium_install_cluster1 }}

      - name: Install Cilium in cluster2
        run: |
          cilium --context ${{ env.contextName2 }} install \
            ${{ steps.vars.outputs.cilium_image_settings }} \
            ${{ steps.vars.outputs.cilium_install_defaults }} \
            ${{ steps.clustermesh-vars.outputs.cilium_install_cluster2 }}

      - name: Wait for cluster mesh status to be ready
        run: |
          cilium --context ${{ env.contextName1 }} status --wait
          cilium --context ${{ env.contextName2 }} status --wait
          cilium --context ${{ env.contextName1 }} clustermesh status --wait --wait-duration=5m
          cilium --context ${{ env.contextName2 }} clustermesh status --wait --wait-duration=5m

      - name: Make JUnit report directory
        run: |
          mkdir -p cilium-junits

      # NOTE(review): in this and the two later connectivity test steps, the
      # github_job_step JUnit property does not match the step's name, and the
      # report file names mix "pre-upgrade" with "post upgrade" /
      # "post downgrade" — confirm what consumers of the reports expect.
      - name: Run connectivity test - pre-upgrade (${{ join(matrix.*, ', ') }})
        run: |
          cilium --context ${{ env.contextName1 }} connectivity test \
            --multi-cluster=${{ env.contextName2 }} \
            ${{ steps.vars.outputs.connectivity_test_defaults }} \
            --junit-file "cilium-junits/${{ env.job_name }} - pre-upgrade (${{ join(matrix.*, ', ') }}).xml" \
            --junit-property github_job_step="Run tests pre-upgrade (${{ join(matrix.*, ', ') }})"

      # Moves cluster1 from the downgrade version to the version under test.
      - name: Upgrade Cilium in cluster1
        run: |
          cilium --context ${{ env.contextName1 }} upgrade --reset-values \
            ${{ steps.vars.outputs.cilium_image_settings }} \
            ${{ steps.vars.outputs.cilium_install_defaults }} \
            ${{ steps.clustermesh-vars.outputs.cilium_install_cluster1 }}

      - name: Rollout Cilium agents in cluster2
        run: |
          # This makes sure that the remote agents reconnect to the new instance of the
          # clustermesh-apiserver, without waiting for the watchdog mechanism to kick in.
          kubectl --context ${{ env.contextName2 }} rollout restart -n kube-system ds/cilium

      - name: Wait for cluster mesh status to be ready
        run: |
          cilium --context ${{ env.contextName1 }} status --wait
          cilium --context ${{ env.contextName2 }} status --wait
          cilium --context ${{ env.contextName1 }} clustermesh status --wait --wait-duration=5m
          cilium --context ${{ env.contextName2 }} clustermesh status --wait --wait-duration=5m

      - name: Run connectivity test - post-upgrade (${{ join(matrix.*, ', ') }})
        run: |
          cilium --context ${{ env.contextName1 }} connectivity test \
            --multi-cluster=${{ env.contextName2 }} \
            ${{ steps.vars.outputs.connectivity_test_defaults }} \
            --junit-file "cilium-junits/${{ env.job_name }} - post upgrade (${{ join(matrix.*, ', ') }}).xml" \
            --junit-property github_job_step="Run tests post-upgrade (${{ join(matrix.*, ', ') }})"

      # Moves cluster1 back to the downgrade version.
      - name: Downgrade Cilium in cluster1
        run: |
          cilium --context ${{ env.contextName1 }} upgrade --reset-values \
            ${{ steps.downgrade-vars.outputs.cilium_image_settings }} \
            ${{ steps.vars.outputs.cilium_install_defaults }} \
            ${{ steps.clustermesh-vars.outputs.cilium_install_cluster1 }}

      - name: Rollout Cilium agents in cluster2
        run: |
          # This makes sure that the remote agents reconnect to the new instance of the
          # clustermesh-apiserver, without waiting for the watchdog mechanism to kick in.
          kubectl --context ${{ env.contextName2 }} rollout restart -n kube-system ds/cilium

      - name: Wait for cluster mesh status to be ready
        run: |
          cilium --context ${{ env.contextName1 }} status --wait
          cilium --context ${{ env.contextName2 }} status --wait
          cilium --context ${{ env.contextName1 }} clustermesh status --wait --wait-duration=5m
          cilium --context ${{ env.contextName2 }} clustermesh status --wait --wait-duration=5m

      - name: Run connectivity test - post-downgrade (${{ join(matrix.*, ', ') }})
        run: |
          cilium --context ${{ env.contextName1 }} connectivity test \
            --multi-cluster=${{ env.contextName2 }} \
            ${{ steps.vars.outputs.connectivity_test_defaults }} \
            --junit-file "cilium-junits/${{ env.job_name }} - post downgrade (${{ join(matrix.*, ', ') }}).xml" \
            --junit-property github_job_step="Run tests post-downgrade (${{ join(matrix.*, ', ') }})"

      # Runs only when the job did not succeed and the cluster1 install step
      # was not skipped; collects status output and a sysdump per cluster.
      - name: Post-test information gathering
        if: ${{ !success() && steps.install-cilium-cluster1.outcome != 'skipped' }}
        run: |
          cilium --context ${{ env.contextName1 }} status
          cilium --context ${{ env.contextName1 }} clustermesh status
          cilium --context ${{ env.contextName2 }} status
          cilium --context ${{ env.contextName2 }} clustermesh status

          kubectl config use-context ${{ env.contextName1 }}
          kubectl get pods --all-namespaces -o wide
          cilium sysdump --output-filename cilium-sysdump-context1-final-${{ join(matrix.*, '-') }}

          kubectl config use-context ${{ env.contextName2 }}
          kubectl get pods --all-namespaces -o wide
          cilium sysdump --output-filename cilium-sysdump-context2-final-${{ join(matrix.*, '-') }}
        shell: bash {0} # Disable default fail-fast behaviour so that all commands run independently

      - name: Upload artifacts
        if: ${{ !success() }}
        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
        with:
          name: cilium-sysdumps-${{ matrix.name }}
          path: cilium-sysdump-*.zip

      - name: Upload JUnits [junit]
        if: ${{ always() }}
        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
        with:
          name: cilium-junits-${{ matrix.name }}
          path: cilium-junits/*.xml

      - name: Publish Test Results As GitHub Summary
        if: ${{ always() }}
        uses: aanm/junit2md@332ebf0fddd34e91b03a832cfafaa826306558f9 # v0.0.3
        with:
          junit-directory: "cilium-junits"

  # Merges the per-matrix-entry artifacts uploaded by the test job into a
  # single artifact of each kind.
  merge-upload:
    if: ${{ always() }}
    name: Merge and Upload Artifacts
    runs-on: ubuntu-latest
    needs: upgrade-and-downgrade
    steps:
      # Sysdumps are only merged after a failed test job, and a failure of the
      # merge itself does not fail this job.
      - name: Merge Sysdumps
        if: ${{ needs.upgrade-and-downgrade.result == 'failure' }}
        uses: actions/upload-artifact/merge@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
        with:
          name: cilium-sysdumps
          pattern: cilium-sysdumps-*
          retention-days: 5
          delete-merged: true
        continue-on-error: true

      - name: Merge JUnits
        uses: actions/upload-artifact/merge@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
        with:
          name: cilium-junits
          pattern: cilium-junits-*
          retention-days: 5
          delete-merged: true

  # Reports the result of the test job as the commit status of the SHA under
  # test, whatever that result is.
  commit-status-final:
    if: ${{ always() }}
    name: Commit Status Final
    needs: upgrade-and-downgrade
    runs-on: ubuntu-latest
    steps:
      - name: Set final commit status
        uses: myrotvorets/set-commit-status-action@38f3f27c7d52fb381273e95542f07f0fba301307 # v2.0.0
        with:
          sha: ${{ inputs.SHA || github.sha }}
          status: ${{ needs.upgrade-and-downgrade.result }}