Address uncaught exceptions in webhook handler #952

Merged · 7 commits · Sep 12, 2024
2 changes: 1 addition & 1 deletion .github/workflows/custom_docker_builds.yml
@@ -42,7 +42,7 @@ jobs:
         - docker-image: ./images/cache-indexer
           image-tags: ghcr.io/spack/cache-indexer:0.0.3
         - docker-image: ./analytics
-          image-tags: ghcr.io/spack/django:0.3.15
+          image-tags: ghcr.io/spack/django:0.3.16
         - docker-image: ./images/ci-prune-buildcache
           image-tags: ghcr.io/spack/ci-prune-buildcache:0.0.4
         - docker-image: ./images/protected-publish
3 changes: 3 additions & 0 deletions analytics/analytics/job_processor/__init__.py
@@ -137,5 +137,8 @@ def process_job(job_input_data_json: str):
 
     with transaction.atomic():
         job = create_job_fact(gl, gl_job, job_input_data, job_trace)
+
+    # Create build timing facts in a separate transaction, in case this fails
+    with transaction.atomic():
         if job.job.job_type == JobDataDimension.JobType.BUILD:
             create_build_timing_facts(job_fact=job, gljob=gl_job)
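
Note on the hunk above: Django rolls back only the atomic block in which an exception is raised, so splitting the work into two transaction.atomic() blocks keeps the already-committed job fact even if creating the build timing facts fails (assuming process_job is not itself wrapped in an outer transaction). A minimal, runnable sketch of that isolation property, using the standard library's sqlite3 in place of the Django ORM:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE job_fact (id INTEGER PRIMARY KEY)")
conn.execute("CREATE TABLE timing_fact (id INTEGER PRIMARY KEY, job_id INTEGER)")

# First transaction: commits on its own when the block exits cleanly.
with conn:
    conn.execute("INSERT INTO job_fact (id) VALUES (1)")

# Second transaction: an exception rolls back only this block.
try:
    with conn:
        conn.execute("INSERT INTO timing_fact (id, job_id) VALUES (1, 1)")
        raise RuntimeError("simulated failure while creating timing facts")
except RuntimeError:
    pass

print(conn.execute("SELECT COUNT(*) FROM job_fact").fetchone()[0])     # 1: still committed
print(conn.execute("SELECT COUNT(*) FROM timing_fact").fetchone()[0])  # 0: rolled back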
11 changes: 9 additions & 2 deletions analytics/analytics/job_processor/artifacts.py
@@ -4,6 +4,7 @@
 from dataclasses import dataclass
 
 import yaml
+from gitlab.exceptions import GitlabGetError
 from gitlab.v4.objects import ProjectJob
 
 
@@ -30,9 +31,15 @@ def get_job_artifacts_file(job: ProjectJob, filename: str):
     """Yields a file IO, raises KeyError if the filename is not present"""
     with tempfile.NamedTemporaryFile(suffix=".zip") as temp:
         artifacts_file = temp.name
-        with open(artifacts_file, "wb") as f:
-            job.artifacts(streamed=True, action=f.write)
 
+        # Download artifacts zip
+        try:
+            with open(artifacts_file, "wb") as f:
+                job.artifacts(streamed=True, action=f.write)
+        except GitlabGetError:
+            raise JobArtifactFileNotFound(job, filename)
+
+        # Open specific file within artifacts zip
         with zipfile.ZipFile(artifacts_file) as zfile:
             try:
                 with zfile.open(filename) as timing_file:
4 changes: 3 additions & 1 deletion analytics/analytics/job_processor/dimensions.py
@@ -185,7 +185,7 @@ def create_runner_dimension(gl: gitlab.Gitlab, gljob: ProjectJob) -> RunnerDimension:
         in_cluster = True
 
     # Create and return new runner
-    return RunnerDimension.objects.create(
+    runner, _ = RunnerDimension.objects.get_or_create(
         runner_id=runner_id,
         name=runner_name,
         platform=runner.platform,
@@ -195,6 +195,8 @@ def create_runner_dimension(gl: gitlab.Gitlab, gljob: ProjectJob) -> RunnerDimension:
         in_cluster=in_cluster,
     )
 
+    return runner
+
 
 def create_package_dimension(info: PackageInfo | None) -> PackageDimension:
     if info is None:
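
Context for the get_or_create change above: objects.create() always issues an INSERT, so reprocessing a webhook for a runner that already has a dimension row raises an uncaught IntegrityError, while get_or_create() returns the existing row instead. A short sketch of the general Django ORM semantics, using a hypothetical Widget model and assuming a configured Django project (not code from this repository):

# Keyword arguments are the lookup fields; defaults is applied only when a new row is inserted.
widget, created = Widget.objects.get_or_create(
    serial="r-123",
    defaults={"label": "first seen"},
)
if not created:
    # The row already existed; no IntegrityError is raised and the existing instance is returned.
    ...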
85 changes: 50 additions & 35 deletions analytics/analytics/job_processor/metadata.py
@@ -12,7 +12,11 @@
     JobArtifactVariablesNotFound,
     get_job_artifacts_data,
 )
-from analytics.job_processor.prometheus import PrometheusClient
+from analytics.job_processor.prometheus import (
+    JobPrometheusDataNotFound,
+    PrometheusClient,
+    UnexpectedPrometheusResult,
+)
 
 
 @dataclass(frozen=True)
@@ -93,43 +97,11 @@ class JobInfo:
     node: NodeInfo | None = None
 
 
-@functools.lru_cache(maxsize=128, typed=True)
-def retrieve_job_info(gljob: ProjectJob, is_build: bool) -> JobInfo:
-    """Retrieve job info for a job.
-
-    This is cached as it may be invoked by different functions to retrieve the same underlying data.
-    """
+def retrieve_job_prometheus_info(gljob: ProjectJob, is_build: bool):
     client = PrometheusClient(settings.PROMETHEUS_URL)
     pod_name = client.get_pod_name_from_gitlab_job(gljob=gljob)
     if pod_name is None:
-        if not is_build:
-            return JobInfo()
-
-        # If the build is failed, this is not unexpected. Otherwise, raise the error
-        try:
-            artifacts = get_job_artifacts_data(gljob)
-        except (JobArtifactFileNotFound, JobArtifactVariablesNotFound):
-            if gljob.status == "failed":
-                return JobInfo()
-
-            raise
-
-        return JobInfo(
-            package=PackageInfo(
-                name=artifacts.package_name,
-                hash=artifacts.package_hash,
-                version=artifacts.package_version,
-                compiler_name=artifacts.compiler_name,
-                compiler_version=artifacts.compiler_version,
-                arch=artifacts.arch,
-                variants=artifacts.package_variants,
-            ),
-            misc=JobMiscInfo(
-                job_size=artifacts.job_size,
-                stack=artifacts.stack,
-                build_jobs=artifacts.build_jobs,
-            ),
-        )
+        raise JobPrometheusDataNotFound(gljob.id)
 
     # Retrieve the remaining info from prometheus
     start = isoparse(gljob.started_at)
@@ -187,3 +159,46 @@ def retrieve_job_info(gljob: ProjectJob, is_build: bool) -> JobInfo:
             build_jobs=pod_labels.build_jobs,
         ),
     )
+
+
+@functools.lru_cache(maxsize=128, typed=True)
+def retrieve_job_info(gljob: ProjectJob, is_build: bool) -> JobInfo:
+    """Retrieve job info for a job.
+
+    This is cached as it may be invoked by different functions to retrieve the same underlying data.
+    """
+
+    try:
+        return retrieve_job_prometheus_info(gljob=gljob, is_build=is_build)
+    except (JobPrometheusDataNotFound, UnexpectedPrometheusResult):
+        pass
+
+    # Handle non-cluster jobs or jobs with failed prometheus info
+    if not is_build:
+        return JobInfo()
+
+    # If the build is failed, this is not unexpected. Otherwise, raise the error
+    try:
+        artifacts = get_job_artifacts_data(gljob)
+    except (JobArtifactFileNotFound, JobArtifactVariablesNotFound):
+        if gljob.status == "failed":
+            return JobInfo()
+
+        raise
+
+    return JobInfo(
+        package=PackageInfo(
+            name=artifacts.package_name,
+            hash=artifacts.package_hash,
+            version=artifacts.package_version,
+            compiler_name=artifacts.compiler_name,
+            compiler_version=artifacts.compiler_version,
+            arch=artifacts.arch,
+            variants=artifacts.package_variants,
+        ),
+        misc=JobMiscInfo(
+            job_size=artifacts.job_size,
+            stack=artifacts.stack,
+            build_jobs=artifacts.build_jobs,
+        ),
+    )
23 changes: 16 additions & 7 deletions analytics/analytics/job_processor/prometheus.py
@@ -271,13 +271,18 @@ def get_pod_usage_and_occupancy(
         node_occupancy = calculate_node_occupancy(results, step)
 
         # Finally, determine the node memory usage
-        memory_usage = self.query_range(
+        memory_usage: list = self.query_range(
            f"container_memory_working_set_bytes{{container='build', pod='{pod}'}}",
             start=start,
             end=end,
             step=step,
             single_result=True,
         )["values"]
+        if not memory_usage:
+            raise UnexpectedPrometheusResult(
+                message=f"Pod {pod} not found in memory usage query",
+                query=cpu_seconds_query,
+            )
 
         # Results consist of arrays: [timestamp, "value"]
         byte_values = [int(x) for _, x in memory_usage]
@@ -403,16 +408,20 @@ def get_pod_node_data(self, pod: str, start: datetime, end: datetime) -> NodeData:
         # Retrieve the price of this node. Since this price can change in the middle of this job's
         # lifetime, we return all values from this query and average them.
         zone = node_labels["label_topology_kubernetes_io_zone"]
-        spot_prices_result = self.query_range(
-            f"""
+        price_query = f"""
             karpenter_cloudprovider_instance_type_price_estimate{{
                 capacity_type='{capacity_type}',
                 instance_type='{instance_type}',
                 zone='{zone}'
-            }}""",
-            start=start,
-            end=end,
-        )
+            }}"""
+
+        spot_prices_result = self.query_range(query=price_query, start=start, end=end)
+        if not spot_prices_result:
+            raise UnexpectedPrometheusResult(
+                message=f"Node with parameters: ({capacity_type}, {instance_type}, {zone}) not found in spot price query",
+                query=price_query,
+            )
 
         spot_price = statistics.mean(
             [float(val[1]) for result in spot_prices_result for val in result["values"]]
         )
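
The empty-result checks added above line up with the shape of Prometheus range-query results: each matching series comes back as a dict with a metric label set and a values list of [timestamp, "value"] pairs, and the result list is simply empty when nothing matched. A self-contained sketch of the spot-price averaging step, assuming query_range returns that result list (the sample numbers are made up):

import statistics

# Assumed shape of a range-query result: one dict per matching series, string-encoded samples.
spot_prices_result = [
    {
        "metric": {"instance_type": "m5.xlarge", "capacity_type": "spot"},
        "values": [[1726000000, "0.0840"], [1726000300, "0.0775"]],
    },
]

if not spot_prices_result:
    raise RuntimeError("no price samples returned")  # stand-in for UnexpectedPrometheusResult

spot_price = statistics.mean(
    float(val[1]) for result in spot_prices_result for val in result["values"]
)
print(spot_price)  # 0.08075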
4 changes: 2 additions & 2 deletions k8s/production/custom/webhook-handler/deployments.yaml
@@ -23,7 +23,7 @@ spec:
       serviceAccountName: webhook-handler
       containers:
         - name: webhook-handler
-          image: ghcr.io/spack/django:0.3.15
+          image: ghcr.io/spack/django:0.3.16
          imagePullPolicy: Always
          resources:
            requests:
@@ -146,7 +146,7 @@ spec:
       serviceAccountName: webhook-handler
       containers:
         - name: webhook-handler-worker
-          image: ghcr.io/spack/django:0.3.15
+          image: ghcr.io/spack/django:0.3.16
          command: ["celery", "-A", "analytics.celery", "worker", "-l", "info", "-Q", "celery"]
          imagePullPolicy: Always
          resources: