Merge branch 'main' into adjust-webhook-handler-resources
mvandenburgh authored Jul 31, 2024
2 parents e5c2f2b + 32b9f5a commit 4747d39
Showing 19 changed files with 166 additions and 47 deletions.
55 changes: 55 additions & 0 deletions .github/scripts/check_docker_image_versions.sh
@@ -0,0 +1,55 @@
#!/usr/bin/env bash

echoerr() { printf "%s\n" "$*" >&2; }
escapestr() { sed -e 's/[.\/]/\\&/g'; }

WORKFLOW_FILE=./.github/workflows/custom_docker_builds.yml

# Set to 1 if any of the checks fail
FAILED=0

GIT_DIFF='git diff origin/main HEAD'

# What gets fed into the $image var is defined at the end of the loop
while read image; do
    DOCKER_IMAGE_DIR=$(echo $image | jq '."docker-image"' -r | sed 's/^\.\///')
    DOCKER_IMAGE_DIR_PATTERN=$(echo $DOCKER_IMAGE_DIR | escapestr)

    # Skip if the directory was not modified at all
    if ! $GIT_DIFF --name-only | grep "^$DOCKER_IMAGE_DIR_PATTERN" > /dev/null; then
        continue
    fi

    # Is the found tag in the added lines of the diff? If so, don't error just yet.
    # If not, error, as that means the tag we're looking at is the old tag.
    IMAGE_TAG=$(echo $image | jq '."image-tags"' -r)
    IMAGE_TAG_PATTERN=$(echo $IMAGE_TAG | escapestr)
    if ! $GIT_DIFF -- $WORKFLOW_FILE | grep "^+[^+].\+$IMAGE_TAG_PATTERN" > /dev/null; then
        FAILED=1
        echoerr "ERROR: Directory '$DOCKER_IMAGE_DIR' modified, but image tag $IMAGE_TAG not incremented!"
        continue
    fi

    # Find the old tag from the diff and search for it. If it still exists, error,
    # as that means it hasn't been bumped.
    BASE_IMAGE_TAG=$(echo $IMAGE_TAG | cut -d ":" -f1)
    BASE_IMAGE_TAG_PATTERN=$(echo $BASE_IMAGE_TAG | escapestr)
    OLD_TAG=$($GIT_DIFF -- $WORKFLOW_FILE | sed -nr "s/^-[^-].+($BASE_IMAGE_TAG_PATTERN)/\1/p")

    NEW_TAG_VERSION=$(echo $IMAGE_TAG | cut -d ":" -f2)
    OLD_TAG_VERSION=$(echo $OLD_TAG | cut -d ":" -f2)

    # Search for this old tag. If found, error, as we should only find the new tag.
    OLD_TAG_PATTERN=$(echo $OLD_TAG | escapestr)
    if git grep $OLD_TAG_PATTERN > /dev/null; then
        FAILED=1
        echoerr "ERROR: Image $BASE_IMAGE_TAG incremented to $NEW_TAG_VERSION, found remaining occurrences of $OLD_TAG_VERSION!"
    fi

    # This is where the input to the while-loop variable $image comes in. This is called a "here string" and
    # circumvents the issue with subshells setting global variables.
    # https://www.gnu.org/savannah-checkouts/gnu/bash/manual/bash.html#Here-Strings
done <<< $(cat $WORKFLOW_FILE | yq ".jobs.build.strategy.matrix.include" -o json | jq -c ".[]")

if [ "$FAILED" -eq "1" ]; then
exit 1
fi
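
The here-string on the final line is load-bearing: piping the yq/jq output into the while loop would run the loop body in a subshell, and the FAILED=1 assignments would never reach the exit check at the bottom. A minimal bash sketch of the difference, independent of this script:

    #!/usr/bin/env bash

    FAILED=0
    # Pipeline form: the loop runs in a subshell, so the assignment is lost.
    printf 'a\nb\n' | while read -r line; do FAILED=1; done
    echo "after pipeline: FAILED=$FAILED"     # prints 0

    FAILED=0
    # Here-string form: the loop runs in the current shell, so it persists.
    while read -r line; do FAILED=1; done <<< $'a\nb'
    echo "after here-string: FAILED=$FAILED"  # prints 1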
17 changes: 17 additions & 0 deletions .github/workflows/check_docker_image_versions.yml
@@ -0,0 +1,17 @@
name: Check Docker Image Versions

on:
  pull_request:

jobs:
  check-image:
    if: ${{ !contains(github.event.pull_request.labels.*.name, 'no-image-bump') }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - run: git fetch --no-tags --prune --depth=1 origin main

      - name: Check for modified directories that need an image bump
        run: ./.github/scripts/check_docker_image_versions.sh
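
The no-image-bump label is the escape hatch for PRs that touch an image directory without needing a rebuild. As one way to apply it, assuming the GitHub CLI is available (the PR number here is hypothetical):

    gh pr edit 123 --add-label no-image-bump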
16 changes: 9 additions & 7 deletions .github/workflows/custom_docker_builds.yml
@@ -5,10 +5,12 @@ on:
     branches:
       - main
     paths:
+      - analytics/**
       - images/**
       - .github/workflows/custom_docker_builds.yml
   pull_request:
     paths:
+      - analytics/**
       - images/**
       - .github/workflows/custom_docker_builds.yml

@@ -20,7 +22,7 @@ jobs:
       matrix:
         include:
           - docker-image: ./images/gh-gl-sync
-            image-tags: ghcr.io/spack/ci-bridge:0.0.41
+            image-tags: ghcr.io/spack/ci-bridge:0.0.42
           - docker-image: ./images/ci-key-clear
             image-tags: ghcr.io/spack/ci-key-clear:0.0.2
           - docker-image: ./images/gitlab-stuckpods
@@ -30,7 +32,7 @@ jobs:
           - docker-image: ./images/gitlab-delete-stale-branches
             image-tags: ghcr.io/spack/gitlab-delete-stale-branches:0.0.1
           - docker-image: ./images/gitlab-skipped-pipelines
-            image-tags: ghcr.io/spack/gitlab-skipped-pipelines:0.0.1
+            image-tags: ghcr.io/spack/gitlab-skipped-pipelines:0.0.2
           - docker-image: ./images/notary
             image-tags: ghcr.io/spack/notary:latest
           - docker-image: ./images/python-aws-bash
@@ -40,20 +42,20 @@ jobs:
           - docker-image: ./images/cache-indexer
             image-tags: ghcr.io/spack/cache-indexer:0.0.3
           - docker-image: ./analytics
-            image-tags: ghcr.io/spack/django:0.3.5
+            image-tags: ghcr.io/spack/django:0.3.14
           - docker-image: ./images/ci-prune-buildcache
-            image-tags: ghcr.io/spack/ci-prune-buildcache:0.0.3
+            image-tags: ghcr.io/spack/ci-prune-buildcache:0.0.4
           - docker-image: ./images/protected-publish
             image-tags: ghcr.io/spack/protected-publish:0.0.1
     steps:
       - name: Checkout
         uses: actions/checkout@1d96c772d19495a3b5c517cd2bc0cb401ea0529f # v4.1.3

       - name: Set up QEMU
-        uses: docker/setup-qemu-action@68827325e0b33c7199eb31dd4e31fbe9023e06e3 # v3.0.0
+        uses: docker/setup-qemu-action@5927c834f5b4fdf503fca6f4c7eccda82949e1ee # v3.1.0

       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@d70bba72b1f3fd22344832f00baa16ece964efeb # v3.3.0
+        uses: docker/setup-buildx-action@4fd812986e6c8c2a69e18311145f9371337f27d4 # v3.4.0

       - name: Log in to the Container registry
         uses: docker/login-action@0d4c9c5ea7693da7b068278f7b52bda2a190a446 # v3.2.0
@@ -64,7 +66,7 @@ jobs:

       - name: Build ${{ (github.ref == 'refs/heads/main' && 'and push ') || '' }}${{ matrix.docker-image }}
         id: docker-build-push
-        uses: docker/build-push-action@31159d49c0d4756269a0940a750801a1ea5d7003 # v6.1.0
+        uses: docker/build-push-action@a254f8ca60a858f3136a2f1f23a60969f2c402dd # v6.4.0
         with:
           context: ${{ matrix.docker-image }}
           file: ${{ matrix.docker-image }}/Dockerfile
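
The new check script reads this same matrix. To see exactly what it iterates over, the pipeline from the script's last line can be run by hand (yq and jq required); output abbreviated:

    cat .github/workflows/custom_docker_builds.yml \
        | yq ".jobs.build.strategy.matrix.include" -o json \
        | jq -c ".[]"
    # {"docker-image":"./images/gh-gl-sync","image-tags":"ghcr.io/spack/ci-bridge:0.0.42"}
    # {"docker-image":"./images/ci-key-clear","image-tags":"ghcr.io/spack/ci-key-clear:0.0.2"}
    # ...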
9 changes: 9 additions & 0 deletions analytics/analytics/core/job_failure_classifier/taxonomy.yaml
@@ -24,6 +24,9 @@ taxonomy:
       - "Error: errors occurred during concretization"
       - "Error: concretization failed for the following reasons"
       - "Spack concretizer internal error."
+      - "failed to concretize .+ for the following reasons"
+      - "variant .+ not found in package"
+      - "trying to set variant .+ in package .+, but the package has no such variant"

   job_log_missing:
     grep_for:
@@ -54,6 +57,10 @@ taxonomy:
     grep_for:
       - 'SpackError: No installed spec matches the hash'

+  failed_to_get_specs:
+    grep_for:
+      - 'Error: Unable to generate package index: Failed to get list of specs from'
+
   build_error:
     grep_for:
       - 'error found in build log:'
@@ -155,6 +162,7 @@ taxonomy:
       - 'Error: No version for .+ satisfies'
       - 'Error: errors occurred during concretization of the environment'
       - 'cannot load package .+ from the .builtin. repository'
+      - 'must have a default provider in /builds/spack/spack/etc/spack/defaults/packages.yaml'

   invalid_pipeline_yaml:
     grep_for:
@@ -254,6 +262,7 @@ taxonomy:
       - 'db_match'
       - 'db_hash'
       - 'no_spec'
+      - 'failed_to_get_specs'
       - 'ref_not_found'
       - 'cmd_not_found'
       - 'module_not_found'
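
Each grep_for entry is a pattern matched against CI job logs to classify failures. A rough way to test one of the new patterns locally, using a hypothetical log line:

    echo "Error: failed to concretize zlib for the following reasons" \
        | grep -E "failed to concretize .+ for the following reasons" \
        && echo "matched: concretization error"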
32 changes: 18 additions & 14 deletions analytics/analytics/job_processor/prometheus.py
@@ -368,9 +368,11 @@ def get_pod_node_data(self, pod: str, start: datetime, end: datetime) -> NodeDat
             )["metric"]["system_uuid"]
         )

-        # Get node labels
+        # Get node labels. Include extra labels to prevent the results being split up
+        # into two sets (one before this label was added and one after). This can occur if
+        # the job is scheduled on a newly created node.
         node_labels = self.query_range(
-            f"kube_node_labels{{node='{node_name}'}}",
+            f"kube_node_labels{{node='{node_name}', label_karpenter_sh_initialized='true', label_topology_ebs_csi_aws_com_zone=~'.+'}}",
             start=start,
             end=end,
             single_result=True,
@@ -384,19 +386,21 @@ def get_pod_node_data(self, pod: str, start: datetime, end: datetime) -> NodeDat
         capacity_type = node_labels["label_karpenter_sh_capacity_type"]
         instance_type = node_labels["label_node_kubernetes_io_instance_type"]

-        # Retrieve the price of this node
+        # Retrieve the price of this node. Since this price can change in the middle of this job's
+        # lifetime, we return all values from this query and average them.
         zone = node_labels["label_topology_kubernetes_io_zone"]
-        spot_price = float(
-            self.query_range(
-                "karpenter_cloudprovider_instance_type_price_estimate{"
-                f"capacity_type='{capacity_type}',"
-                f"instance_type='{instance_type}',"
-                f"zone='{zone}'"
-                "}",
-                start=start,
-                end=end,
-                single_result=True,
-            )["values"][0][1]
-        )
+        spot_prices_result = self.query_range(
+            f"""
+            karpenter_cloudprovider_instance_type_price_estimate{{
+                capacity_type='{capacity_type}',
+                instance_type='{instance_type}',
+                zone='{zone}'
+            }}""",
+            start=start,
+            end=end,
+        )
+        spot_price = statistics.mean(
+            [float(val[1]) for result in spot_prices_result for val in result["values"]]
+        )

         # Save and set as job node
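
The same average can be reproduced against the Prometheus HTTP API directly. A sketch assuming a reachable Prometheus server and illustrative label values; the jq filter mirrors the flattening done by the list comprehension above:

    PROM_URL="http://prometheus.example.com"   # assumed endpoint
    curl -sG "$PROM_URL/api/v1/query_range" \
        --data-urlencode "query=karpenter_cloudprovider_instance_type_price_estimate{capacity_type='spot',instance_type='m5.xlarge',zone='us-east-1a'}" \
        --data-urlencode "start=2024-07-31T00:00:00Z" \
        --data-urlencode "end=2024-07-31T01:00:00Z" \
        --data-urlencode "step=60s" \
        | jq '[.data.result[].values[][1] | tonumber] | add / length'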
2 changes: 1 addition & 1 deletion analytics/requirements.txt
@@ -21,7 +21,7 @@ decorator==5.1.1
 distlib==0.3.8
 dj-database-url==2.1.0
 dj-email-url==1.0.6
-Django==4.2.9
+Django==4.2.14
 django-click==2.3.0
 django-configurations==2.5
 django-cors-headers==4.3.1
16 changes: 12 additions & 4 deletions images/ci-prune-buildcache/ci_buildcache_prune.py
@@ -229,6 +229,12 @@ def configure_parser():
         help="use the buildcache index to check for buildcache hashes",
         action="store_true",
     )
+    parser.add_argument(
+        "--fail-fast",
+        help="Fail immediately on an error, otherwise continue until "
+        "there is no more work.",
+        action="store_true",
+    )

     return parser
@@ -467,12 +473,12 @@ def configure_parser():
     fname_template = f"{args.output_dir}/delete-{{0}}-{stack}{log_suffix}.json"
     if err:
         print(f"errors: {stack}")
-        with open(fname_template.format("errors")) as fd:
+        with open(fname_template.format("errors"), "w") as fd:
             helper.write_json(fd, err)

     if fail:
         print(f"failures: {stack}")
-        with open(fname_template.format("failures")) as fd:
+        with open(fname_template.format("failures"), "w") as fd:
             helper.write_json(fd, fail)
     else:
         print(f"-- Would have deleted of {len(lines)} from {stack} buildcache")
@@ -494,8 +500,10 @@ def configure_parser():
             )
         except Exception as e:
             print(f"Error -- Skipping pruning of {stack}")
-            print(str(e))
-            raise "ff mode" from e
+            if args.fail_fast:
+                raise e
+            else:
+                print(str(e))
         finally:
             if prune_file:
                 prune_file.close()
6 changes: 3 additions & 3 deletions images/gh-gl-sync/SpackCIBridge.py
@@ -168,8 +168,8 @@ def list_github_prs(self):
             # 2) we have pushed it before, but the HEAD sha has changed since we pushed it last
             log_args = ["git", "log", "--pretty=%s", "gitlab/{0}".format(pr_string)]
             try:
-                merge_commit_msg = _durable_subprocess_run(
-                    log_args, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL).stdout
+                merge_commit_msg = subprocess.run(
+                    log_args, check=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL).stdout
                 match = self.merge_msg_regex.match(merge_commit_msg.decode("utf-8"))
                 if match and (match.group(1) == pull.head.sha or match.group(2) == pull.head.sha):
                     print("Skip pushing {0} because GitLab already has HEAD {1}".format(pr_string, pull.head.sha))
@@ -232,7 +232,7 @@ def list_github_prs(self):
                 backlogged = f"GitHub HEAD shas out of sync (repo={r_sha}, API={a_sha})"
                 push = False
             # Check if our PR's merge base is an ancestor of the latest tested main branch commit.
-            elif _durable_subprocess_run(
+            elif subprocess.run(
                 ["git", "merge-base", "--is-ancestor", merge_base_sha, self.latest_tested_main_commit]
             ).returncode == 0:
                 print(f"{tmp_pr_branch}'s merge base IS an ancestor of latest_tested_main "
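
git merge-base --is-ancestor communicates only through its exit status, which is why the code checks returncode rather than parsing output. A standalone illustration that works in any repository with an up-to-date origin/main:

    # Exits 0 if the first commit is an ancestor of the second, 1 otherwise.
    if git merge-base --is-ancestor origin/main HEAD; then
        echo "HEAD already contains origin/main"
    else
        echo "HEAD is missing commits from origin/main"
    fi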
30 changes: 27 additions & 3 deletions images/gitlab-skipped-pipelines/skipped_pipelines.py
@@ -6,7 +6,8 @@
 import urllib.parse
 from datetime import datetime, timedelta, timezone

-import requests
+from requests import Session
+from requests.adapters import HTTPAdapter, Retry
 import sentry_sdk

 sentry_sdk.init(
@@ -21,13 +22,30 @@
     "PRIVATE-TOKEN": os.environ.get("GITLAB_TOKEN", None)
 }

+session = Session()
+session.mount(
+    "https://",
+    HTTPAdapter(
+        max_retries=Retry(
+            total=5,
+            backoff_factor=2,
+            backoff_jitter=1,
+        ),
+    ),
+)
+

 def paginate(query_url):
     """Helper method to get all pages of paginated query results"""
     results = []

     while query_url:
-        resp = requests.get(query_url, headers=AUTH_HEADER)
+        try:
+            resp = session.get(query_url, headers=AUTH_HEADER, timeout=10)
+        except OSError as e:
+            print(f"Request to {query_url} failed")
+            sentry_sdk.capture_exception(e)
+            return []

         if resp.status_code == 401:
             print(" !!! Unauthorized to make request, check GITLAB_TOKEN !!!")
@@ -59,7 +77,13 @@ def run_new_pipeline(pipeline_ref):
     enc_ref = urllib.parse.quote_plus(pipeline_ref)
     run_url = f"{GITLAB_API_URL}/pipeline?ref={enc_ref}"
     print(f" !!!! running new pipeline for {pipeline_ref}")
-    print_response(requests.post(run_url, headers=AUTH_HEADER), " ")
+    try:
+        resp = session.post(run_url, headers=AUTH_HEADER, timeout=10)
+    except OSError as e:
+        print(f"Request to {run_url} failed")
+        sentry_sdk.capture_exception(e)
+        return None
+    print_response(resp, " ")


 def find_and_run_skipped_pipelines():
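
With the adapter mounted, every session.get/session.post against an https:// URL retries transient failures with exponential backoff before the except OSError blocks above ever fire. For a quick manual probe of similar behavior, curl offers a comparable knob (URL illustrative):

    # Retry up to 5 times; curl doubles its wait between attempts.
    curl --retry 5 --retry-all-errors --max-time 10 \
        "https://gitlab.example.com/api/v4/projects/42/pipelines?status=skipped"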
2 changes: 1 addition & 1 deletion k8s/production/custom/gh-gl-sync/cron-jobs.yaml
@@ -16,7 +16,7 @@ spec:
           restartPolicy: Never
           containers:
             - name: sync
-              image: ghcr.io/spack/ci-bridge:0.0.41
+              image: ghcr.io/spack/ci-bridge:0.0.42
               imagePullPolicy: IfNotPresent
               resources:
                 requests:
2 changes: 1 addition & 1 deletion k8s/production/custom/prune-buildcache/cron-jobs.yaml
@@ -17,7 +17,7 @@ spec:
           restartPolicy: Never
           containers:
             - name: pruner
-              image: ghcr.io/spack/ci-prune-buildcache:0.0.3
+              image: ghcr.io/spack/ci-prune-buildcache:0.0.4
               imagePullPolicy: IfNotPresent
               resources:
                 requests:
2 changes: 1 addition & 1 deletion k8s/production/custom/skipped-pipelines/cron-jobs.yaml
@@ -14,7 +14,7 @@ spec:
           restartPolicy: Never
           containers:
             - name: skipped-pipelines
-              image: ghcr.io/spack/gitlab-skipped-pipelines:0.0.1
+              image: ghcr.io/spack/gitlab-skipped-pipelines:0.0.2
               imagePullPolicy: IfNotPresent
               resources:
                 requests:
4 changes: 2 additions & 2 deletions k8s/production/custom/webhook-handler/deployments.yaml
@@ -23,7 +23,7 @@ spec:
       serviceAccountName: webhook-handler
       containers:
         - name: webhook-handler
-          image: ghcr.io/spack/django:0.3.5
+          image: ghcr.io/spack/django:0.3.14
           imagePullPolicy: Always
           resources:
             requests:
@@ -146,7 +146,7 @@ spec:
       serviceAccountName: webhook-handler
       containers:
         - name: webhook-handler-worker
-          image: ghcr.io/spack/django:0.3.5
+          image: ghcr.io/spack/django:0.3.14
           command: ["celery", "-A", "analytics.celery", "worker", "-l", "info", "-Q", "celery"]
           imagePullPolicy: Always
           resources:
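
Once these manifests roll out, the running image can be verified straight from the cluster. This assumes kubectl access; the namespace name here is hypothetical:

    kubectl -n spack get deployment webhook-handler \
        -o jsonpath='{.spec.template.spec.containers[0].image}'
    # expected: ghcr.io/spack/django:0.3.14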
