From 8ec347e34e4aa87ec27fa906be3b14c989c1415d Mon Sep 17 00:00:00 2001 From: Zack Galbreath Date: Wed, 26 Jun 2024 15:43:21 -0400 Subject: [PATCH 01/37] Upgrade to gitlab.spack.io to 16.11.5 --- k8s/production/gitlab/release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/production/gitlab/release.yaml b/k8s/production/gitlab/release.yaml index 0b9936909..f6eaab34a 100644 --- a/k8s/production/gitlab/release.yaml +++ b/k8s/production/gitlab/release.yaml @@ -19,7 +19,7 @@ spec: chart: spec: chart: gitlab - version: 7.11.4 # gitlab@16.11.4 + version: 7.11.5 # gitlab@16.11.5 sourceRef: kind: HelmRepository name: gitlab From 84e5b4c7666057897e8e30a04ef376e9a380bab3 Mon Sep 17 00:00:00 2001 From: Zack Galbreath Date: Thu, 27 Jun 2024 15:01:19 -0400 Subject: [PATCH 02/37] Don't use _durable_subprocess_run for commands that are allowed to fail (#901) --- .github/workflows/custom_docker_builds.yml | 2 +- images/gh-gl-sync/SpackCIBridge.py | 6 +++--- k8s/production/custom/gh-gl-sync/cron-jobs.yaml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/custom_docker_builds.yml b/.github/workflows/custom_docker_builds.yml index 641f983ea..8e53e631d 100644 --- a/.github/workflows/custom_docker_builds.yml +++ b/.github/workflows/custom_docker_builds.yml @@ -20,7 +20,7 @@ jobs: matrix: include: - docker-image: ./images/gh-gl-sync - image-tags: ghcr.io/spack/ci-bridge:0.0.41 + image-tags: ghcr.io/spack/ci-bridge:0.0.42 - docker-image: ./images/ci-key-clear image-tags: ghcr.io/spack/ci-key-clear:0.0.2 - docker-image: ./images/gitlab-stuckpods diff --git a/images/gh-gl-sync/SpackCIBridge.py b/images/gh-gl-sync/SpackCIBridge.py index a3eebe1d9..3610463e9 100644 --- a/images/gh-gl-sync/SpackCIBridge.py +++ b/images/gh-gl-sync/SpackCIBridge.py @@ -168,8 +168,8 @@ def list_github_prs(self): # 2) we have pushed it before, but the HEAD sha has changed since we pushed it last log_args = ["git", "log", "--pretty=%s", "gitlab/{0}".format(pr_string)] try: - merge_commit_msg = _durable_subprocess_run( - log_args, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL).stdout + merge_commit_msg = subprocess.run( + log_args, check=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL).stdout match = self.merge_msg_regex.match(merge_commit_msg.decode("utf-8")) if match and (match.group(1) == pull.head.sha or match.group(2) == pull.head.sha): print("Skip pushing {0} because GitLab already has HEAD {1}".format(pr_string, pull.head.sha)) @@ -232,7 +232,7 @@ def list_github_prs(self): backlogged = f"GitHub HEAD shas out of sync (repo={r_sha}, API={a_sha})" push = False # Check if our PR's merge base is an ancestor of the latest tested main branch commit. - elif _durable_subprocess_run( + elif subprocess.run( ["git", "merge-base", "--is-ancestor", merge_base_sha, self.latest_tested_main_commit] ).returncode == 0: print(f"{tmp_pr_branch}'s merge base IS an ancestor of latest_tested_main " diff --git a/k8s/production/custom/gh-gl-sync/cron-jobs.yaml b/k8s/production/custom/gh-gl-sync/cron-jobs.yaml index cb1675436..94134039f 100644 --- a/k8s/production/custom/gh-gl-sync/cron-jobs.yaml +++ b/k8s/production/custom/gh-gl-sync/cron-jobs.yaml @@ -16,7 +16,7 @@ spec: restartPolicy: Never containers: - name: sync - image: ghcr.io/spack/ci-bridge:0.0.41 + image: ghcr.io/spack/ci-bridge:0.0.42 imagePullPolicy: IfNotPresent resources: requests: From 0533712e9bf9f9c9075c0aed9d0fc7f720225473 Mon Sep 17 00:00:00 2001 From: Ryan Krattiger Date: Thu, 27 Jun 2024 12:29:10 -0500 Subject: [PATCH 03/37] Fix error handling in pruning script * add option to fail fast * open error/failures files for write correctly --- .../ci-prune-buildcache/ci_buildcache_prune.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/images/ci-prune-buildcache/ci_buildcache_prune.py b/images/ci-prune-buildcache/ci_buildcache_prune.py index e82dbcf15..c0c0a8600 100644 --- a/images/ci-prune-buildcache/ci_buildcache_prune.py +++ b/images/ci-prune-buildcache/ci_buildcache_prune.py @@ -229,6 +229,12 @@ def configure_parser(): help="use the buildcache index to check for buildcache hashes", action="store_true", ) + parser.add_argument( + "--fail-fast", + help="Fail immediately on an error, otherwise continue until" + "there is not more work.", + action="store_true", + ) return parser @@ -467,12 +473,12 @@ def configure_parser(): fname_template = f"{args.output_dir}/delete-{{0}}-{stack}{log_suffix}.json" if err: print(f"errors: {stack}") - with open(fname_template.format("errors")) as fd: + with open(fname_template.format("errors", "w")) as fd: helper.write_json(fd, err) if fail: print(f"failures: {stack}") - with open(fname_template.format("failures")) as fd: + with open(fname_template.format("failures", "w")) as fd: helper.write_json(fd, fail) else: print(f"-- Would have deleted of {len(lines)} from {stack} buildcache") @@ -494,8 +500,10 @@ def configure_parser(): ) except Exception as e: print(f"Error -- Skipping pruning of {stack}") - print(str(e)) - raise "ff mode" from e + if args.fail_fast: + raise e + else: + print(str(e)) finally: if prune_file: prune_file.close() From 775a57cc5dd18b847b50d71c711c6e5939d98a3b Mon Sep 17 00:00:00 2001 From: Ryan Krattiger Date: Thu, 27 Jun 2024 14:42:15 -0500 Subject: [PATCH 04/37] Update pruner image version --- .github/workflows/custom_docker_builds.yml | 2 +- k8s/production/custom/prune-buildcache/cron-jobs.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/custom_docker_builds.yml b/.github/workflows/custom_docker_builds.yml index 641f983ea..28284a375 100644 --- a/.github/workflows/custom_docker_builds.yml +++ b/.github/workflows/custom_docker_builds.yml @@ -42,7 +42,7 @@ jobs: - docker-image: ./analytics image-tags: ghcr.io/spack/django:0.3.5 - docker-image: ./images/ci-prune-buildcache - image-tags: ghcr.io/spack/ci-prune-buildcache:0.0.3 + image-tags: ghcr.io/spack/ci-prune-buildcache:0.0.4 - docker-image: ./images/protected-publish image-tags: ghcr.io/spack/protected-publish:0.0.1 steps: diff --git a/k8s/production/custom/prune-buildcache/cron-jobs.yaml b/k8s/production/custom/prune-buildcache/cron-jobs.yaml index 54bd1a593..3b2bd3820 100644 --- a/k8s/production/custom/prune-buildcache/cron-jobs.yaml +++ b/k8s/production/custom/prune-buildcache/cron-jobs.yaml @@ -17,7 +17,7 @@ spec: restartPolicy: Never containers: - name: pruner - image: ghcr.io/spack/ci-prune-buildcache:0.0.3 + image: ghcr.io/spack/ci-prune-buildcache:0.0.4 imagePullPolicy: IfNotPresent resources: requests: From 3062ad1d7d637bca376dea33faaad64284ccce4f Mon Sep 17 00:00:00 2001 From: Zack Galbreath Date: Fri, 28 Jun 2024 09:58:01 -0400 Subject: [PATCH 05/37] Add retries to gitlab requests in skipped_pipelines (#904) * Add retries to gitlab requests in skipped_pipelines * Update images/gitlab-skipped-pipelines/skipped_pipelines.py Co-authored-by: Mike VanDenburgh <37340715+mvandenburgh@users.noreply.github.com> * Update images/gitlab-skipped-pipelines/skipped_pipelines.py Co-authored-by: Mike VanDenburgh <37340715+mvandenburgh@users.noreply.github.com> --------- Co-authored-by: Mike VanDenburgh <37340715+mvandenburgh@users.noreply.github.com> --- .github/workflows/custom_docker_builds.yml | 2 +- .../skipped_pipelines.py | 30 +++++++++++++++++-- .../custom/skipped-pipelines/cron-jobs.yaml | 2 +- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/.github/workflows/custom_docker_builds.yml b/.github/workflows/custom_docker_builds.yml index 0f5655c6d..5f28f9165 100644 --- a/.github/workflows/custom_docker_builds.yml +++ b/.github/workflows/custom_docker_builds.yml @@ -30,7 +30,7 @@ jobs: - docker-image: ./images/gitlab-delete-stale-branches image-tags: ghcr.io/spack/gitlab-delete-stale-branches:0.0.1 - docker-image: ./images/gitlab-skipped-pipelines - image-tags: ghcr.io/spack/gitlab-skipped-pipelines:0.0.1 + image-tags: ghcr.io/spack/gitlab-skipped-pipelines:0.0.2 - docker-image: ./images/notary image-tags: ghcr.io/spack/notary:latest - docker-image: ./images/python-aws-bash diff --git a/images/gitlab-skipped-pipelines/skipped_pipelines.py b/images/gitlab-skipped-pipelines/skipped_pipelines.py index 97ae8f273..2d648cabd 100644 --- a/images/gitlab-skipped-pipelines/skipped_pipelines.py +++ b/images/gitlab-skipped-pipelines/skipped_pipelines.py @@ -6,7 +6,8 @@ import urllib.parse from datetime import datetime, timedelta, timezone -import requests +from requests import Session +from requests.adapters import HTTPAdapter, Retry import sentry_sdk sentry_sdk.init( @@ -21,13 +22,30 @@ "PRIVATE-TOKEN": os.environ.get("GITLAB_TOKEN", None) } +session = Session() +session.mount( + "https://", + HTTPAdapter( + max_retries=Retry( + total=5, + backoff_factor=2, + backoff_jitter=1, + ), + ), +) + def paginate(query_url): """Helper method to get all pages of paginated query results""" results = [] while query_url: - resp = requests.get(query_url, headers=AUTH_HEADER) + try: + resp = session.get(query_url, headers=AUTH_HEADER, timeout=10) + except OSError as e: + print(f"Request to {query_url} failed") + sentry_sdk.capture_exception(e) + return [] if resp.status_code == 401: print(" !!! Unauthorized to make request, check GITLAB_TOKEN !!!") @@ -59,7 +77,13 @@ def run_new_pipeline(pipeline_ref): enc_ref = urllib.parse.quote_plus(pipeline_ref) run_url = f"{GITLAB_API_URL}/pipeline?ref={enc_ref}" print(f" !!!! running new pipeline for {pipeline_ref}") - print_response(requests.post(run_url, headers=AUTH_HEADER), " ") + try: + resp = session.post(run_url, headers=AUTH_HEADER, timeout=10) + except OSError as e: + print(f"Request to {run_url} failed") + sentry_sdk.capture_exception(e) + return None + print_response(resp, " ") def find_and_run_skipped_pipelines(): diff --git a/k8s/production/custom/skipped-pipelines/cron-jobs.yaml b/k8s/production/custom/skipped-pipelines/cron-jobs.yaml index a07cd9b84..5ee02d082 100644 --- a/k8s/production/custom/skipped-pipelines/cron-jobs.yaml +++ b/k8s/production/custom/skipped-pipelines/cron-jobs.yaml @@ -14,7 +14,7 @@ spec: restartPolicy: Never containers: - name: skipped-pipelines - image: ghcr.io/spack/gitlab-skipped-pipelines:0.0.1 + image: ghcr.io/spack/gitlab-skipped-pipelines:0.0.2 imagePullPolicy: IfNotPresent resources: requests: From 24a908cb287110f4b5dc6107696f22539454937d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Jul 2024 13:46:18 +0000 Subject: [PATCH 06/37] [gh-actions](deps): Bump docker/setup-qemu-action from 3.0.0 to 3.1.0 Bumps [docker/setup-qemu-action](https://github.com/docker/setup-qemu-action) from 3.0.0 to 3.1.0. - [Release notes](https://github.com/docker/setup-qemu-action/releases) - [Commits](https://github.com/docker/setup-qemu-action/compare/68827325e0b33c7199eb31dd4e31fbe9023e06e3...5927c834f5b4fdf503fca6f4c7eccda82949e1ee) --- updated-dependencies: - dependency-name: docker/setup-qemu-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/custom_docker_builds.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/custom_docker_builds.yml b/.github/workflows/custom_docker_builds.yml index 5f28f9165..4c4a3b2ba 100644 --- a/.github/workflows/custom_docker_builds.yml +++ b/.github/workflows/custom_docker_builds.yml @@ -50,7 +50,7 @@ jobs: uses: actions/checkout@1d96c772d19495a3b5c517cd2bc0cb401ea0529f # v4.1.3 - name: Set up QEMU - uses: docker/setup-qemu-action@68827325e0b33c7199eb31dd4e31fbe9023e06e3 # v3.0.0 + uses: docker/setup-qemu-action@5927c834f5b4fdf503fca6f4c7eccda82949e1ee # v3.1.0 - name: Set up Docker Buildx uses: docker/setup-buildx-action@d70bba72b1f3fd22344832f00baa16ece964efeb # v3.3.0 From e594a6e886b86a6c083eeea477bb2fcc9131b511 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Jul 2024 13:46:27 +0000 Subject: [PATCH 07/37] [gh-actions](deps): Bump docker/setup-buildx-action from 3.3.0 to 3.4.0 Bumps [docker/setup-buildx-action](https://github.com/docker/setup-buildx-action) from 3.3.0 to 3.4.0. - [Release notes](https://github.com/docker/setup-buildx-action/releases) - [Commits](https://github.com/docker/setup-buildx-action/compare/d70bba72b1f3fd22344832f00baa16ece964efeb...4fd812986e6c8c2a69e18311145f9371337f27d4) --- updated-dependencies: - dependency-name: docker/setup-buildx-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/custom_docker_builds.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/custom_docker_builds.yml b/.github/workflows/custom_docker_builds.yml index 5f28f9165..e5a9b811d 100644 --- a/.github/workflows/custom_docker_builds.yml +++ b/.github/workflows/custom_docker_builds.yml @@ -53,7 +53,7 @@ jobs: uses: docker/setup-qemu-action@68827325e0b33c7199eb31dd4e31fbe9023e06e3 # v3.0.0 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@d70bba72b1f3fd22344832f00baa16ece964efeb # v3.3.0 + uses: docker/setup-buildx-action@4fd812986e6c8c2a69e18311145f9371337f27d4 # v3.4.0 - name: Log in to the Container registry uses: docker/login-action@0d4c9c5ea7693da7b068278f7b52bda2a190a446 # v3.2.0 From 50f4f0a0fdf67fbabb46cc164062fe9ab1373202 Mon Sep 17 00:00:00 2001 From: Jacob Nesbitt Date: Mon, 8 Jul 2024 13:27:11 -0400 Subject: [PATCH 08/37] Add karpenter label to prometheus node query --- analytics/analytics/job_processor/prometheus.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/analytics/analytics/job_processor/prometheus.py b/analytics/analytics/job_processor/prometheus.py index 531efb206..d694beda8 100644 --- a/analytics/analytics/job_processor/prometheus.py +++ b/analytics/analytics/job_processor/prometheus.py @@ -368,9 +368,11 @@ def get_pod_node_data(self, pod: str, start: datetime, end: datetime) -> NodeDat )["metric"]["system_uuid"] ) - # Get node labels + # Get node labels. Include the karpenter label to prevent the results being split up + # into two sets (one before this label was added and one after). This can occur if + # the job is scheduled on a newly created node node_labels = self.query_range( - f"kube_node_labels{{node='{node_name}'}}", + f"kube_node_labels{{node='{node_name}', label_karpenter_sh_initialized='true'}}", start=start, end=end, single_result=True, From bf10a40412c627339e4dc94e465b81385e4fd82d Mon Sep 17 00:00:00 2001 From: Jacob Nesbitt Date: Mon, 8 Jul 2024 14:08:43 -0400 Subject: [PATCH 09/37] Bump django image version --- .github/workflows/custom_docker_builds.yml | 2 +- k8s/production/custom/webhook-handler/deployments.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/custom_docker_builds.yml b/.github/workflows/custom_docker_builds.yml index 5f28f9165..a320dcda5 100644 --- a/.github/workflows/custom_docker_builds.yml +++ b/.github/workflows/custom_docker_builds.yml @@ -40,7 +40,7 @@ jobs: - docker-image: ./images/cache-indexer image-tags: ghcr.io/spack/cache-indexer:0.0.3 - docker-image: ./analytics - image-tags: ghcr.io/spack/django:0.3.5 + image-tags: ghcr.io/spack/django:0.3.6 - docker-image: ./images/ci-prune-buildcache image-tags: ghcr.io/spack/ci-prune-buildcache:0.0.4 - docker-image: ./images/protected-publish diff --git a/k8s/production/custom/webhook-handler/deployments.yaml b/k8s/production/custom/webhook-handler/deployments.yaml index 1cd075907..16c8d1b6c 100644 --- a/k8s/production/custom/webhook-handler/deployments.yaml +++ b/k8s/production/custom/webhook-handler/deployments.yaml @@ -23,7 +23,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler - image: ghcr.io/spack/django:0.3.5 + image: ghcr.io/spack/django:0.3.6 imagePullPolicy: Always resources: requests: @@ -146,7 +146,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler-worker - image: ghcr.io/spack/django:0.3.5 + image: ghcr.io/spack/django:0.3.6 command: ["celery", "-A", "analytics.celery", "worker", "-l", "info", "-Q", "celery"] imagePullPolicy: Always resources: From 05252c25c09f14df79eb748355dfae9ab44c631f Mon Sep 17 00:00:00 2001 From: Jacob Nesbitt Date: Tue, 9 Jul 2024 13:02:46 -0400 Subject: [PATCH 10/37] Add another filter to node label query --- analytics/analytics/job_processor/prometheus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/analytics/analytics/job_processor/prometheus.py b/analytics/analytics/job_processor/prometheus.py index d694beda8..df13db819 100644 --- a/analytics/analytics/job_processor/prometheus.py +++ b/analytics/analytics/job_processor/prometheus.py @@ -368,11 +368,11 @@ def get_pod_node_data(self, pod: str, start: datetime, end: datetime) -> NodeDat )["metric"]["system_uuid"] ) - # Get node labels. Include the karpenter label to prevent the results being split up + # Get node labels. Include extra labels to prevent the results being split up # into two sets (one before this label was added and one after). This can occur if # the job is scheduled on a newly created node node_labels = self.query_range( - f"kube_node_labels{{node='{node_name}', label_karpenter_sh_initialized='true'}}", + f"kube_node_labels{{node='{node_name}', label_karpenter_sh_initialized='true', label_topology_ebs_csi_aws_com_zone=~'.+'}}", start=start, end=end, single_result=True, From eb5b62b6d6a516bf5f7d2a3fc6aeab083d6d1f6c Mon Sep 17 00:00:00 2001 From: Jacob Nesbitt Date: Tue, 9 Jul 2024 13:09:00 -0400 Subject: [PATCH 11/37] Bump django image version --- .github/workflows/custom_docker_builds.yml | 2 +- k8s/production/custom/webhook-handler/deployments.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/custom_docker_builds.yml b/.github/workflows/custom_docker_builds.yml index a320dcda5..fdbab5d2e 100644 --- a/.github/workflows/custom_docker_builds.yml +++ b/.github/workflows/custom_docker_builds.yml @@ -40,7 +40,7 @@ jobs: - docker-image: ./images/cache-indexer image-tags: ghcr.io/spack/cache-indexer:0.0.3 - docker-image: ./analytics - image-tags: ghcr.io/spack/django:0.3.6 + image-tags: ghcr.io/spack/django:0.3.7 - docker-image: ./images/ci-prune-buildcache image-tags: ghcr.io/spack/ci-prune-buildcache:0.0.4 - docker-image: ./images/protected-publish diff --git a/k8s/production/custom/webhook-handler/deployments.yaml b/k8s/production/custom/webhook-handler/deployments.yaml index 16c8d1b6c..587edcba0 100644 --- a/k8s/production/custom/webhook-handler/deployments.yaml +++ b/k8s/production/custom/webhook-handler/deployments.yaml @@ -23,7 +23,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler - image: ghcr.io/spack/django:0.3.6 + image: ghcr.io/spack/django:0.3.7 imagePullPolicy: Always resources: requests: @@ -146,7 +146,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler-worker - image: ghcr.io/spack/django:0.3.6 + image: ghcr.io/spack/django:0.3.7 command: ["celery", "-A", "analytics.celery", "worker", "-l", "info", "-Q", "celery"] imagePullPolicy: Always resources: From 4e6dc8c2a32da3a5e39957e1f542cbe2130fbf5f Mon Sep 17 00:00:00 2001 From: Jacob Nesbitt Date: Mon, 8 Jul 2024 17:12:07 -0400 Subject: [PATCH 12/37] Add CI workflow to check for missing image version bumps --- .../scripts/check_docker_image_versions.sh | 51 +++++++++++++++++++ .../workflows/check_docker_image_versions.yml | 17 +++++++ 2 files changed, 68 insertions(+) create mode 100755 .github/scripts/check_docker_image_versions.sh create mode 100644 .github/workflows/check_docker_image_versions.yml diff --git a/.github/scripts/check_docker_image_versions.sh b/.github/scripts/check_docker_image_versions.sh new file mode 100755 index 000000000..8ce47e734 --- /dev/null +++ b/.github/scripts/check_docker_image_versions.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash + +echoerr() { printf "%s\n" "$*" >&2; } + +WORKFLOW_FILE=./.github/workflows/custom_docker_builds.yml + +# Set to 1 if any of the checks fail +FAILED=0 + +GIT_DIFF='git diff origin/main HEAD' + +# What gets fed into the $image var is defined at the end of the loop +while read image; do + DOCKER_IMAGE_DIR=$(echo $image | jq '."docker-image"' -r | sed 's/^\.\///') + IMAGE_TAG=$(echo $image | jq '."image-tags"' -r) + + # Skip if the directory was not modified at all + if ! $GIT_DIFF --name-only | grep $DOCKER_IMAGE_DIR > /dev/null; then + continue + fi + + # Is the found tag in the added lines of the diff? If so, don't error just yet. + # If not, error, as that means the tag we're looking at is the old tag + if ! $GIT_DIFF -- $WORKFLOW_FILE | grep "^+[^+].\+$IMAGE_TAG" > /dev/null; then + FAILED=1 + echoerr "ERROR: Directory '$DOCKER_IMAGE_DIR' modified, but image tag $IMAGE_TAG not incremented!" + continue + fi + + # Find the old tag from the diff and search for it. If it exists, error, as that means it hasn't been bumped + BASE_IMAGE_TAG=$(echo $IMAGE_TAG | cut -d ":" -f1) + BASE_IMAGE_TAG_PATTERN=$(echo $BASE_IMAGE_TAG | sed 's/[.\/]/\\&/g') + OLD_TAG=$($GIT_DIFF -- $WORKFLOW_FILE | sed -nr s"/^-[^-].+($BASE_IMAGE_TAG_PATTERN)/\1/p") + + NEW_TAG_VERSION=$(echo $IMAGE_TAG | cut -d ":" -f2) + OLD_TAG_VERSION=$(echo $OLD_TAG | cut -d ":" -f2) + + # Search for this old tag. If found error, as we should only find the new tag + if git grep $OLD_TAG > /dev/null; then + FAILED=1 + echoerr "ERROR: Image $BASE_IMAGE_TAG incremented to $NEW_TAG_VERSION, found remaining occurances of $OLD_TAG_VERSION!" + fi + +# This is where the input to the while loop variable $image comes in. This is called a "here string" and +# circumvents the issue with subshells setting global variables. +# https://www.gnu.org/savannah-checkouts/gnu/bash/manual/bash.html#Here-Strings +done <<< $(cat $WORKFLOW_FILE | yq ".jobs.build.strategy.matrix.include" -o json | jq -c ".[]") + +if [ "$FAILED" -eq "1" ]; then + exit 1 +fi diff --git a/.github/workflows/check_docker_image_versions.yml b/.github/workflows/check_docker_image_versions.yml new file mode 100644 index 000000000..f5c6ec99b --- /dev/null +++ b/.github/workflows/check_docker_image_versions.yml @@ -0,0 +1,17 @@ +name: Check Docker Image Versions + +on: + pull_request: + +jobs: + check-image: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'no-image-bump') }} + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - run: git fetch --no-tags --prune --depth=1 origin main + + - name: Check for modified directories that need an image bump + run: ./.github/scripts/check_docker_image_versions.sh From e1a7fac273909ddf374c1d2b6d5efa7ef928cf69 Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Wed, 10 Jul 2024 20:11:03 -0400 Subject: [PATCH 13/37] Upgrade Karpenter to 0.31.0 --- terraform/modules/spack/karpenter.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/modules/spack/karpenter.tf b/terraform/modules/spack/karpenter.tf index 4eb5dc7df..ccb830820 100644 --- a/terraform/modules/spack/karpenter.tf +++ b/terraform/modules/spack/karpenter.tf @@ -1,5 +1,5 @@ locals { - karpenter_version = "v0.29.0" + karpenter_version = "v0.31.0" } module "karpenter" { From 8c27a6a0679cbd7595d26dbc5234fd5ee2bebb39 Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Wed, 10 Jul 2024 20:12:20 -0400 Subject: [PATCH 14/37] Upgrade Karpenter to 0.31.4 --- terraform/modules/spack/karpenter.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/modules/spack/karpenter.tf b/terraform/modules/spack/karpenter.tf index ccb830820..183cefcf3 100644 --- a/terraform/modules/spack/karpenter.tf +++ b/terraform/modules/spack/karpenter.tf @@ -1,5 +1,5 @@ locals { - karpenter_version = "v0.31.0" + karpenter_version = "v0.31.4" } module "karpenter" { From 35fc4e39a3df65fd62c0c4edf7e93b91f474e74b Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Thu, 11 Jul 2024 12:41:36 -0400 Subject: [PATCH 15/37] Upgrade TF `karpenter` module to 19.21.0 --- terraform/modules/spack/karpenter.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/modules/spack/karpenter.tf b/terraform/modules/spack/karpenter.tf index 183cefcf3..63e82f2f7 100644 --- a/terraform/modules/spack/karpenter.tf +++ b/terraform/modules/spack/karpenter.tf @@ -4,7 +4,7 @@ locals { module "karpenter" { source = "terraform-aws-modules/eks/aws//modules/karpenter" - version = "18.31.0" + version = "19.21.0" cluster_name = module.eks.cluster_name From d62621f4562d3505eed435003eacc57a247af452 Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Thu, 11 Jul 2024 15:04:02 -0400 Subject: [PATCH 16/37] Upgrade to GitLab 16.11.6 --- k8s/production/gitlab/release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/production/gitlab/release.yaml b/k8s/production/gitlab/release.yaml index f6eaab34a..82a17e98d 100644 --- a/k8s/production/gitlab/release.yaml +++ b/k8s/production/gitlab/release.yaml @@ -19,7 +19,7 @@ spec: chart: spec: chart: gitlab - version: 7.11.5 # gitlab@16.11.5 + version: 7.11.6 # gitlab@16.11.6 sourceRef: kind: HelmRepository name: gitlab From 0c44e5c55f04473a0d7e425111af13b411c28822 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 13:21:55 +0000 Subject: [PATCH 17/37] [gh-actions](deps): Bump docker/build-push-action from 6.1.0 to 6.4.0 Bumps [docker/build-push-action](https://github.com/docker/build-push-action) from 6.1.0 to 6.4.0. - [Release notes](https://github.com/docker/build-push-action/releases) - [Commits](https://github.com/docker/build-push-action/compare/31159d49c0d4756269a0940a750801a1ea5d7003...a254f8ca60a858f3136a2f1f23a60969f2c402dd) --- updated-dependencies: - dependency-name: docker/build-push-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/custom_docker_builds.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/custom_docker_builds.yml b/.github/workflows/custom_docker_builds.yml index fdbab5d2e..ca125b863 100644 --- a/.github/workflows/custom_docker_builds.yml +++ b/.github/workflows/custom_docker_builds.yml @@ -64,7 +64,7 @@ jobs: - name: Build ${{ (github.ref == 'refs/heads/main' && 'and push ') || '' }}${{ matrix.docker-image }} id: docker-build-push - uses: docker/build-push-action@31159d49c0d4756269a0940a750801a1ea5d7003 # v6.1.0 + uses: docker/build-push-action@a254f8ca60a858f3136a2f1f23a60969f2c402dd # v6.4.0 with: context: ${{ matrix.docker-image }} file: ${{ matrix.docker-image }}/Dockerfile From 783f2bd1c5aacdcbd067eb15b985157958a87806 Mon Sep 17 00:00:00 2001 From: Flux <> Date: Mon, 15 Jul 2024 13:51:15 -0400 Subject: [PATCH 18/37] Update Flux --- k8s/staging/flux-system/gotk-components.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/k8s/staging/flux-system/gotk-components.yaml b/k8s/staging/flux-system/gotk-components.yaml index 4be71f347..cf85dffca 100644 --- a/k8s/staging/flux-system/gotk-components.yaml +++ b/k8s/staging/flux-system/gotk-components.yaml @@ -2639,7 +2639,7 @@ spec: serviceAccountName: helm-controller terminationGracePeriodSeconds: 600 tolerations: - - key: SpackBootstrap + - key: CriticalAddonsOnly operator: Exists volumes: - emptyDir: {} @@ -4371,7 +4371,7 @@ spec: serviceAccountName: kustomize-controller terminationGracePeriodSeconds: 60 tolerations: - - key: SpackBootstrap + - key: CriticalAddonsOnly operator: Exists volumes: - emptyDir: {} @@ -6289,7 +6289,7 @@ spec: serviceAccountName: notification-controller terminationGracePeriodSeconds: 10 tolerations: - - key: SpackBootstrap + - key: CriticalAddonsOnly operator: Exists volumes: - emptyDir: {} @@ -9684,7 +9684,7 @@ spec: serviceAccountName: source-controller terminationGracePeriodSeconds: 10 tolerations: - - key: SpackBootstrap + - key: CriticalAddonsOnly operator: Exists volumes: - emptyDir: {} From 3fb4b1a2decdc0bc50a00e0407a2213147b383ac Mon Sep 17 00:00:00 2001 From: Jacob Nesbitt Date: Mon, 15 Jul 2024 17:32:27 -0400 Subject: [PATCH 19/37] Average node price over job lifetime --- .../analytics/job_processor/prometheus.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/analytics/analytics/job_processor/prometheus.py b/analytics/analytics/job_processor/prometheus.py index df13db819..a4d01e86c 100644 --- a/analytics/analytics/job_processor/prometheus.py +++ b/analytics/analytics/job_processor/prometheus.py @@ -386,19 +386,21 @@ def get_pod_node_data(self, pod: str, start: datetime, end: datetime) -> NodeDat capacity_type = node_labels["label_karpenter_sh_capacity_type"] instance_type = node_labels["label_node_kubernetes_io_instance_type"] - # Retrieve the price of this node + # Retrieve the price of this node. Since this price can change in the middle of this job's + # lifetime, we return all values from this query and average them. zone = node_labels["label_topology_kubernetes_io_zone"] - spot_price = float( - self.query_range( - "karpenter_cloudprovider_instance_type_price_estimate{" - f"capacity_type='{capacity_type}'," - f"instance_type='{instance_type}'," - f"zone='{zone}'" - "}", - start=start, - end=end, - single_result=True, - )["values"][0][1] + spot_prices_result = self.query_range( + f""" + karpenter_cloudprovider_instance_type_price_estimate{{ + capacity_type='{capacity_type}', + instance_type='{instance_type}', + zone='{zone}' + }}""", + start=start, + end=end, + ) + spot_price = statistics.mean( + [float(val[1]) for result in spot_prices_result for val in result["values"]] ) # Save and set as job node From 60e8396f3face24ee809eed4e8e6adb0ae9e5408 Mon Sep 17 00:00:00 2001 From: Jacob Nesbitt Date: Mon, 15 Jul 2024 17:35:50 -0400 Subject: [PATCH 20/37] Bump django image version --- .github/workflows/custom_docker_builds.yml | 2 +- k8s/production/custom/webhook-handler/deployments.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/custom_docker_builds.yml b/.github/workflows/custom_docker_builds.yml index fdbab5d2e..ac2e2b046 100644 --- a/.github/workflows/custom_docker_builds.yml +++ b/.github/workflows/custom_docker_builds.yml @@ -40,7 +40,7 @@ jobs: - docker-image: ./images/cache-indexer image-tags: ghcr.io/spack/cache-indexer:0.0.3 - docker-image: ./analytics - image-tags: ghcr.io/spack/django:0.3.7 + image-tags: ghcr.io/spack/django:0.3.8 - docker-image: ./images/ci-prune-buildcache image-tags: ghcr.io/spack/ci-prune-buildcache:0.0.4 - docker-image: ./images/protected-publish diff --git a/k8s/production/custom/webhook-handler/deployments.yaml b/k8s/production/custom/webhook-handler/deployments.yaml index 587edcba0..d14b0492a 100644 --- a/k8s/production/custom/webhook-handler/deployments.yaml +++ b/k8s/production/custom/webhook-handler/deployments.yaml @@ -23,7 +23,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler - image: ghcr.io/spack/django:0.3.7 + image: ghcr.io/spack/django:0.3.8 imagePullPolicy: Always resources: requests: @@ -146,7 +146,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler-worker - image: ghcr.io/spack/django:0.3.7 + image: ghcr.io/spack/django:0.3.8 command: ["celery", "-A", "analytics.celery", "worker", "-l", "info", "-Q", "celery"] imagePullPolicy: Always resources: From 05608f23d01202354f68026a6cf966adad99dacf Mon Sep 17 00:00:00 2001 From: Jacob Nesbitt Date: Wed, 17 Jul 2024 12:56:32 -0400 Subject: [PATCH 21/37] Fix search patterns - Add ^ to front of regex for directory changes - Escape all strings used for search --- .github/scripts/check_docker_image_versions.sh | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/scripts/check_docker_image_versions.sh b/.github/scripts/check_docker_image_versions.sh index 8ce47e734..f4eef83e7 100755 --- a/.github/scripts/check_docker_image_versions.sh +++ b/.github/scripts/check_docker_image_versions.sh @@ -1,6 +1,7 @@ #!/usr/bin/env bash echoerr() { printf "%s\n" "$*" >&2; } +escapestr() { sed -e 's/[.\/]/\\&/g'; } WORKFLOW_FILE=./.github/workflows/custom_docker_builds.yml @@ -12,16 +13,18 @@ GIT_DIFF='git diff origin/main HEAD' # What gets fed into the $image var is defined at the end of the loop while read image; do DOCKER_IMAGE_DIR=$(echo $image | jq '."docker-image"' -r | sed 's/^\.\///') - IMAGE_TAG=$(echo $image | jq '."image-tags"' -r) + DOCKER_IMAGE_DIR_PATTERN=$(echo $DOCKER_IMAGE_DIR | escapestr) # Skip if the directory was not modified at all - if ! $GIT_DIFF --name-only | grep $DOCKER_IMAGE_DIR > /dev/null; then + if ! $GIT_DIFF --name-only | grep "^$DOCKER_IMAGE_DIR_PATTERN" > /dev/null; then continue fi # Is the found tag in the added lines of the diff? If so, don't error just yet. # If not, error, as that means the tag we're looking at is the old tag - if ! $GIT_DIFF -- $WORKFLOW_FILE | grep "^+[^+].\+$IMAGE_TAG" > /dev/null; then + IMAGE_TAG=$(echo $image | jq '."image-tags"' -r) + IMAGE_TAG_PATTERN=$(echo $IMAGE_TAG | escapestr) + if ! $GIT_DIFF -- $WORKFLOW_FILE | grep "^+[^+].\+$IMAGE_TAG_PATTERN" > /dev/null; then FAILED=1 echoerr "ERROR: Directory '$DOCKER_IMAGE_DIR' modified, but image tag $IMAGE_TAG not incremented!" continue @@ -29,14 +32,15 @@ while read image; do # Find the old tag from the diff and search for it. If it exists, error, as that means it hasn't been bumped BASE_IMAGE_TAG=$(echo $IMAGE_TAG | cut -d ":" -f1) - BASE_IMAGE_TAG_PATTERN=$(echo $BASE_IMAGE_TAG | sed 's/[.\/]/\\&/g') + BASE_IMAGE_TAG_PATTERN=$(echo $BASE_IMAGE_TAG | escapestr) OLD_TAG=$($GIT_DIFF -- $WORKFLOW_FILE | sed -nr s"/^-[^-].+($BASE_IMAGE_TAG_PATTERN)/\1/p") NEW_TAG_VERSION=$(echo $IMAGE_TAG | cut -d ":" -f2) OLD_TAG_VERSION=$(echo $OLD_TAG | cut -d ":" -f2) # Search for this old tag. If found error, as we should only find the new tag - if git grep $OLD_TAG > /dev/null; then + OLD_TAG_PATTERN=$(echo $OLD_TAG | escapestr) + if git grep $OLD_TAG_PATTERN > /dev/null; then FAILED=1 echoerr "ERROR: Image $BASE_IMAGE_TAG incremented to $NEW_TAG_VERSION, found remaining occurances of $OLD_TAG_VERSION!" fi From 5b231342ed4fd0f8f48618d3e0c32375482d6cb4 Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Thu, 18 Jul 2024 11:32:06 -0400 Subject: [PATCH 22/37] Add new regex to `concretizer_error` taxonomy Jobs like [this one](https://gitlab.spack.io/spack/spack/-/jobs/11963760) are getting categorized as `other`, when they should be `concretizer_error`. This adds a new regex to the taxonomy fix this. --- analytics/analytics/core/job_failure_classifier/taxonomy.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/analytics/analytics/core/job_failure_classifier/taxonomy.yaml b/analytics/analytics/core/job_failure_classifier/taxonomy.yaml index a6d3c448e..6dc079246 100644 --- a/analytics/analytics/core/job_failure_classifier/taxonomy.yaml +++ b/analytics/analytics/core/job_failure_classifier/taxonomy.yaml @@ -24,6 +24,7 @@ taxonomy: - "Error: errors occurred during concretization" - "Error: concretization failed for the following reasons" - "Spack concretizer internal error." + - "failed to concretize .+ for the following reasons" job_log_missing: grep_for: From bd1cc12487420671377bbe9939209a9c96bd018d Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Thu, 18 Jul 2024 11:32:22 -0400 Subject: [PATCH 23/37] Bump image version --- .github/workflows/custom_docker_builds.yml | 2 +- k8s/production/custom/webhook-handler/deployments.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/custom_docker_builds.yml b/.github/workflows/custom_docker_builds.yml index ac2e2b046..632a733db 100644 --- a/.github/workflows/custom_docker_builds.yml +++ b/.github/workflows/custom_docker_builds.yml @@ -40,7 +40,7 @@ jobs: - docker-image: ./images/cache-indexer image-tags: ghcr.io/spack/cache-indexer:0.0.3 - docker-image: ./analytics - image-tags: ghcr.io/spack/django:0.3.8 + image-tags: ghcr.io/spack/django:0.3.9 - docker-image: ./images/ci-prune-buildcache image-tags: ghcr.io/spack/ci-prune-buildcache:0.0.4 - docker-image: ./images/protected-publish diff --git a/k8s/production/custom/webhook-handler/deployments.yaml b/k8s/production/custom/webhook-handler/deployments.yaml index d14b0492a..3d7fc1b0a 100644 --- a/k8s/production/custom/webhook-handler/deployments.yaml +++ b/k8s/production/custom/webhook-handler/deployments.yaml @@ -23,7 +23,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler - image: ghcr.io/spack/django:0.3.8 + image: ghcr.io/spack/django:0.3.9 imagePullPolicy: Always resources: requests: @@ -146,7 +146,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler-worker - image: ghcr.io/spack/django:0.3.8 + image: ghcr.io/spack/django:0.3.9 command: ["celery", "-A", "analytics.celery", "worker", "-l", "info", "-Q", "celery"] imagePullPolicy: Always resources: From 4ca7f856c9ab7dd6d2e4f63a6bddca0d4782451c Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Thu, 18 Jul 2024 12:13:19 -0400 Subject: [PATCH 24/37] Add new regex to `concretizer_error` taxonomy Jobs like [this one](https://gitlab.spack.io/spack/spack/-/jobs/11966614) are getting categorized as `other`, when they should be `concretizer_error`. This adds a new regex to the taxonomy fix this. --- analytics/analytics/core/job_failure_classifier/taxonomy.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/analytics/analytics/core/job_failure_classifier/taxonomy.yaml b/analytics/analytics/core/job_failure_classifier/taxonomy.yaml index 6dc079246..1d8ac10d0 100644 --- a/analytics/analytics/core/job_failure_classifier/taxonomy.yaml +++ b/analytics/analytics/core/job_failure_classifier/taxonomy.yaml @@ -25,6 +25,7 @@ taxonomy: - "Error: concretization failed for the following reasons" - "Spack concretizer internal error." - "failed to concretize .+ for the following reasons" + - "variant .+ not found in package" job_log_missing: grep_for: From e2055996fdb3dc1691526113daea8c287781278a Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Thu, 18 Jul 2024 12:15:04 -0400 Subject: [PATCH 25/37] Bump image version --- .github/workflows/custom_docker_builds.yml | 2 +- k8s/production/custom/webhook-handler/deployments.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/custom_docker_builds.yml b/.github/workflows/custom_docker_builds.yml index 632a733db..df7d4a949 100644 --- a/.github/workflows/custom_docker_builds.yml +++ b/.github/workflows/custom_docker_builds.yml @@ -40,7 +40,7 @@ jobs: - docker-image: ./images/cache-indexer image-tags: ghcr.io/spack/cache-indexer:0.0.3 - docker-image: ./analytics - image-tags: ghcr.io/spack/django:0.3.9 + image-tags: ghcr.io/spack/django:0.3.10 - docker-image: ./images/ci-prune-buildcache image-tags: ghcr.io/spack/ci-prune-buildcache:0.0.4 - docker-image: ./images/protected-publish diff --git a/k8s/production/custom/webhook-handler/deployments.yaml b/k8s/production/custom/webhook-handler/deployments.yaml index 3d7fc1b0a..43c7f8c27 100644 --- a/k8s/production/custom/webhook-handler/deployments.yaml +++ b/k8s/production/custom/webhook-handler/deployments.yaml @@ -23,7 +23,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler - image: ghcr.io/spack/django:0.3.9 + image: ghcr.io/spack/django:0.3.10 imagePullPolicy: Always resources: requests: @@ -146,7 +146,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler-worker - image: ghcr.io/spack/django:0.3.9 + image: ghcr.io/spack/django:0.3.10 command: ["celery", "-A", "analytics.celery", "worker", "-l", "info", "-Q", "celery"] imagePullPolicy: Always resources: From dc6d0362f7ee1386d0e190be203f399e73a48cbf Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 10 Jul 2024 22:42:51 +0000 Subject: [PATCH 26/37] Bump django from 4.2.9 to 4.2.14 in /analytics Bumps [django](https://github.com/django/django) from 4.2.9 to 4.2.14. - [Commits](https://github.com/django/django/compare/4.2.9...4.2.14) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- analytics/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analytics/requirements.txt b/analytics/requirements.txt index 7c98058d5..a6da0d7d9 100644 --- a/analytics/requirements.txt +++ b/analytics/requirements.txt @@ -21,7 +21,7 @@ decorator==5.1.1 distlib==0.3.8 dj-database-url==2.1.0 dj-email-url==1.0.6 -Django==4.2.9 +Django==4.2.14 django-click==2.3.0 django-configurations==2.5 django-cors-headers==4.3.1 From b6265cb3797e9e44c9c30c6671294afbd61830ea Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Thu, 18 Jul 2024 15:20:10 -0400 Subject: [PATCH 27/37] Bump Django image --- .github/workflows/custom_docker_builds.yml | 2 +- k8s/production/custom/webhook-handler/deployments.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/custom_docker_builds.yml b/.github/workflows/custom_docker_builds.yml index 1a0615622..718e6ab65 100644 --- a/.github/workflows/custom_docker_builds.yml +++ b/.github/workflows/custom_docker_builds.yml @@ -40,7 +40,7 @@ jobs: - docker-image: ./images/cache-indexer image-tags: ghcr.io/spack/cache-indexer:0.0.3 - docker-image: ./analytics - image-tags: ghcr.io/spack/django:0.3.10 + image-tags: ghcr.io/spack/django:0.3.11 - docker-image: ./images/ci-prune-buildcache image-tags: ghcr.io/spack/ci-prune-buildcache:0.0.4 - docker-image: ./images/protected-publish diff --git a/k8s/production/custom/webhook-handler/deployments.yaml b/k8s/production/custom/webhook-handler/deployments.yaml index 43c7f8c27..d610beba9 100644 --- a/k8s/production/custom/webhook-handler/deployments.yaml +++ b/k8s/production/custom/webhook-handler/deployments.yaml @@ -23,7 +23,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler - image: ghcr.io/spack/django:0.3.10 + image: ghcr.io/spack/django:0.3.11 imagePullPolicy: Always resources: requests: @@ -146,7 +146,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler-worker - image: ghcr.io/spack/django:0.3.10 + image: ghcr.io/spack/django:0.3.11 command: ["celery", "-A", "analytics.celery", "worker", "-l", "info", "-Q", "celery"] imagePullPolicy: Always resources: From 9a8cd05d6312d600eb438804fe8db4d48550c37c Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Thu, 18 Jul 2024 15:21:22 -0400 Subject: [PATCH 28/37] Add missing directory to docker CI check Modifying the `analytics` directory should trigger a test build of the docker images. --- .github/workflows/custom_docker_builds.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/custom_docker_builds.yml b/.github/workflows/custom_docker_builds.yml index 718e6ab65..2b870c1ea 100644 --- a/.github/workflows/custom_docker_builds.yml +++ b/.github/workflows/custom_docker_builds.yml @@ -5,10 +5,12 @@ on: branches: - main paths: + - analytics/** - images/** - .github/workflows/custom_docker_builds.yml pull_request: paths: + - analytics/** - images/** - .github/workflows/custom_docker_builds.yml From 4c7657041ec426cac22e4f26813f093582b130df Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Fri, 19 Jul 2024 13:08:07 -0400 Subject: [PATCH 29/37] Add new error taxonomy for missing spec list This correctly categorizes the error that occurs when a `rebuild-index` job fails due to a missing spec list, which happens when all dependent jobs fail. Currently, that error is getting labeled as `other`. Example jobs: https://gitlab.spack.io/spack/spack/-/jobs/11980518 https://gitlab.spack.io/spack/spack/-/jobs/11980519 https://gitlab.spack.io/spack/spack/-/jobs/11980551 --- .../analytics/core/job_failure_classifier/taxonomy.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/analytics/analytics/core/job_failure_classifier/taxonomy.yaml b/analytics/analytics/core/job_failure_classifier/taxonomy.yaml index 1d8ac10d0..3c032a345 100644 --- a/analytics/analytics/core/job_failure_classifier/taxonomy.yaml +++ b/analytics/analytics/core/job_failure_classifier/taxonomy.yaml @@ -56,6 +56,10 @@ taxonomy: grep_for: - 'SpackError: No installed spec matches the hash' + failed_to_get_specs: + grep_for: + - 'Error: Unable to generate package index: Failed to get list of specs from' + build_error: grep_for: - 'error found in build log:' @@ -256,6 +260,7 @@ taxonomy: - 'db_match' - 'db_hash' - 'no_spec' + - 'failed_to_get_specs' - 'ref_not_found' - 'cmd_not_found' - 'module_not_found' From 751492266b4309a6f57a1a448f463c1e13cbd330 Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Fri, 19 Jul 2024 13:11:16 -0400 Subject: [PATCH 30/37] Bump image version --- .github/workflows/custom_docker_builds.yml | 2 +- k8s/production/custom/webhook-handler/deployments.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/custom_docker_builds.yml b/.github/workflows/custom_docker_builds.yml index 2b870c1ea..eac8b4bc9 100644 --- a/.github/workflows/custom_docker_builds.yml +++ b/.github/workflows/custom_docker_builds.yml @@ -42,7 +42,7 @@ jobs: - docker-image: ./images/cache-indexer image-tags: ghcr.io/spack/cache-indexer:0.0.3 - docker-image: ./analytics - image-tags: ghcr.io/spack/django:0.3.11 + image-tags: ghcr.io/spack/django:0.3.12 - docker-image: ./images/ci-prune-buildcache image-tags: ghcr.io/spack/ci-prune-buildcache:0.0.4 - docker-image: ./images/protected-publish diff --git a/k8s/production/custom/webhook-handler/deployments.yaml b/k8s/production/custom/webhook-handler/deployments.yaml index d610beba9..9ef7d53e8 100644 --- a/k8s/production/custom/webhook-handler/deployments.yaml +++ b/k8s/production/custom/webhook-handler/deployments.yaml @@ -23,7 +23,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler - image: ghcr.io/spack/django:0.3.11 + image: ghcr.io/spack/django:0.3.12 imagePullPolicy: Always resources: requests: @@ -146,7 +146,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler-worker - image: ghcr.io/spack/django:0.3.11 + image: ghcr.io/spack/django:0.3.12 command: ["celery", "-A", "analytics.celery", "worker", "-l", "info", "-Q", "celery"] imagePullPolicy: Always resources: From 6a315cd841c26eb35d338931c4420c561c846b5a Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Wed, 24 Jul 2024 16:06:49 -0400 Subject: [PATCH 31/37] Add new regex for `concretization_error` This regex will catch jobs like this one, https://gitlab.spack.io/spack/spack/-/jobs/12007967, which currently gets categorized as `other` --- analytics/analytics/core/job_failure_classifier/taxonomy.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/analytics/analytics/core/job_failure_classifier/taxonomy.yaml b/analytics/analytics/core/job_failure_classifier/taxonomy.yaml index 3c032a345..a6d1e4f79 100644 --- a/analytics/analytics/core/job_failure_classifier/taxonomy.yaml +++ b/analytics/analytics/core/job_failure_classifier/taxonomy.yaml @@ -26,6 +26,7 @@ taxonomy: - "Spack concretizer internal error." - "failed to concretize .+ for the following reasons" - "variant .+ not found in package" + - "trying to set variant .+ in package .+, but the package has no such variant" job_log_missing: grep_for: From 11573919416b2ba121280930d5d15fb043fd158a Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Wed, 24 Jul 2024 16:08:12 -0400 Subject: [PATCH 32/37] Bump image version --- .github/workflows/custom_docker_builds.yml | 2 +- k8s/production/custom/webhook-handler/deployments.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/custom_docker_builds.yml b/.github/workflows/custom_docker_builds.yml index eac8b4bc9..693f6e357 100644 --- a/.github/workflows/custom_docker_builds.yml +++ b/.github/workflows/custom_docker_builds.yml @@ -42,7 +42,7 @@ jobs: - docker-image: ./images/cache-indexer image-tags: ghcr.io/spack/cache-indexer:0.0.3 - docker-image: ./analytics - image-tags: ghcr.io/spack/django:0.3.12 + image-tags: ghcr.io/spack/django:0.3.13 - docker-image: ./images/ci-prune-buildcache image-tags: ghcr.io/spack/ci-prune-buildcache:0.0.4 - docker-image: ./images/protected-publish diff --git a/k8s/production/custom/webhook-handler/deployments.yaml b/k8s/production/custom/webhook-handler/deployments.yaml index 9ef7d53e8..afe686d69 100644 --- a/k8s/production/custom/webhook-handler/deployments.yaml +++ b/k8s/production/custom/webhook-handler/deployments.yaml @@ -23,7 +23,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler - image: ghcr.io/spack/django:0.3.12 + image: ghcr.io/spack/django:0.3.13 imagePullPolicy: Always resources: requests: @@ -146,7 +146,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler-worker - image: ghcr.io/spack/django:0.3.12 + image: ghcr.io/spack/django:0.3.13 command: ["celery", "-A", "analytics.celery", "worker", "-l", "info", "-Q", "celery"] imagePullPolicy: Always resources: From b538cb4855d3513f0699d8d62a4c30cac755f88c Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Mon, 29 Jul 2024 21:14:27 -0400 Subject: [PATCH 33/37] Add new regex for `spack_error` taxonomy Catches jobs like these, which were labeled as `other`: https://gitlab.spack.io/spack/spack/-/jobs/12024871 https://gitlab.spack.io/spack/spack/-/jobs/12024740 https://gitlab.spack.io/spack/spack/-/jobs/12024707 --- analytics/analytics/core/job_failure_classifier/taxonomy.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/analytics/analytics/core/job_failure_classifier/taxonomy.yaml b/analytics/analytics/core/job_failure_classifier/taxonomy.yaml index a6d1e4f79..47eac5218 100644 --- a/analytics/analytics/core/job_failure_classifier/taxonomy.yaml +++ b/analytics/analytics/core/job_failure_classifier/taxonomy.yaml @@ -162,6 +162,7 @@ taxonomy: - 'Error: No version for .+ satisfies' - 'Error: errors occurred during concretization of the environment' - 'cannot load package .+ from the .builtin. repository' + - 'must have a default provider in /builds/spack/spack/etc/spack/defaults/packages.yaml' invalid_pipeline_yaml: grep_for: From 9aaf0fe0999a7211125a3e86b482a373809612c1 Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Mon, 29 Jul 2024 21:15:05 -0400 Subject: [PATCH 34/37] Bump image version --- .github/workflows/custom_docker_builds.yml | 2 +- k8s/production/custom/webhook-handler/deployments.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/custom_docker_builds.yml b/.github/workflows/custom_docker_builds.yml index 693f6e357..b5d4d66f8 100644 --- a/.github/workflows/custom_docker_builds.yml +++ b/.github/workflows/custom_docker_builds.yml @@ -42,7 +42,7 @@ jobs: - docker-image: ./images/cache-indexer image-tags: ghcr.io/spack/cache-indexer:0.0.3 - docker-image: ./analytics - image-tags: ghcr.io/spack/django:0.3.13 + image-tags: ghcr.io/spack/django:0.3.14 - docker-image: ./images/ci-prune-buildcache image-tags: ghcr.io/spack/ci-prune-buildcache:0.0.4 - docker-image: ./images/protected-publish diff --git a/k8s/production/custom/webhook-handler/deployments.yaml b/k8s/production/custom/webhook-handler/deployments.yaml index afe686d69..a46315915 100644 --- a/k8s/production/custom/webhook-handler/deployments.yaml +++ b/k8s/production/custom/webhook-handler/deployments.yaml @@ -23,7 +23,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler - image: ghcr.io/spack/django:0.3.13 + image: ghcr.io/spack/django:0.3.14 imagePullPolicy: Always resources: requests: @@ -146,7 +146,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler-worker - image: ghcr.io/spack/django:0.3.13 + image: ghcr.io/spack/django:0.3.14 command: ["celery", "-A", "analytics.celery", "worker", "-l", "info", "-Q", "celery"] imagePullPolicy: Always resources: From 74e23d2b1a57d1a41ceec7ea5b1556dfb6c1b8e6 Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Tue, 30 Jul 2024 09:46:39 -0400 Subject: [PATCH 35/37] Upgrade metabase to v0.50.17 --- k8s/production/metabase/metabase-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/production/metabase/metabase-deployment.yaml b/k8s/production/metabase/metabase-deployment.yaml index 8003ad651..a9e696ebd 100644 --- a/k8s/production/metabase/metabase-deployment.yaml +++ b/k8s/production/metabase/metabase-deployment.yaml @@ -16,7 +16,7 @@ spec: spec: containers: - name: metabase - image: metabase/metabase:v0.50.5 + image: metabase/metabase:v0.50.17 imagePullPolicy: "IfNotPresent" resources: requests: From dbca052ea5aa29442a4d0556df5d5a02f3ae77a9 Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Tue, 30 Jul 2024 09:51:25 -0400 Subject: [PATCH 36/37] Upgrade ingress-nginx to v1.11.1 --- k8s/production/ingress-nginx/release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/production/ingress-nginx/release.yaml b/k8s/production/ingress-nginx/release.yaml index 17539eb30..c14d17e91 100644 --- a/k8s/production/ingress-nginx/release.yaml +++ b/k8s/production/ingress-nginx/release.yaml @@ -19,7 +19,7 @@ spec: chart: spec: chart: ingress-nginx - version: 4.10.1 # ingress-nginx@1.10.1 + version: 4.11.1 # ingress-nginx@1.11.1 sourceRef: kind: HelmRepository name: ingress-nginx From 4abe34d1a874884e4b51245e4f36d731bf517d83 Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Tue, 30 Jul 2024 09:59:48 -0400 Subject: [PATCH 37/37] Upgrade kube-prometheus-stack to v61.6.0 --- k8s/production/prometheus/release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/production/prometheus/release.yaml b/k8s/production/prometheus/release.yaml index eb53c50e5..bd23fa443 100644 --- a/k8s/production/prometheus/release.yaml +++ b/k8s/production/prometheus/release.yaml @@ -20,7 +20,7 @@ spec: chart: spec: chart: kube-prometheus-stack - version: 60.2.0 # prometheus-operator@v0.74.0, grafana@8.0.* + version: 61.6.0 # prometheus-operator@v0.75.2, grafana@8.3.* sourceRef: kind: HelmRepository name: kube-prometheus-stack