Skip to content

Commit cfef38b

Browse files
authored
✨ Add cache for CloudBuild API location queries (apple#967)
1 parent 8fd9137 commit cfef38b

File tree

2 files changed

+451
-165
lines changed

2 files changed

+451
-165
lines changed

axlearn/cloud/gcp/cloud_build.py

Lines changed: 86 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,18 @@ def is_pending(self) -> bool:
6464

6565

6666
def _get_build_request_filter(*, image_name: str, tags: list[str]) -> str:
67-
# To filter builds by multiple tags, use "AND", "OR", or "NOT" to list tags.
68-
# Example: '(tags = tag1 AND tags = tag2) OR results.images.name="image"'.
67+
"""Constructs a filter string to query build requests based on image name and tags.
68+
69+
To filter builds by multiple tags, use "AND", "OR", or "NOT" to list tags.
70+
Example: '(tags = tag1 AND tags = tag2) OR results.images.name="image"'.
71+
72+
Args:
73+
image_name: The name of the image to filter build requests by.
74+
tags: A list of tags to filter build requests by.
75+
76+
Returns:
77+
str: A filter string suitable for use in build request queries.
78+
"""
6979
filter_by_tag = ""
7080
if tags:
7181
filter_by_tag = "(" + " AND ".join(f"tags = {tag}" for tag in tags) + ")" + " OR "
@@ -86,59 +96,68 @@ def _list_available_regions(project_id: str) -> list[str]:
8696
Exception: If an error occurs when retrieving regions from the Compute Engine API.
8797
"""
8898
try:
89-
# Initialize the Compute Engine client.
9099
client = RegionsClient()
91-
92-
# List all regions for the given project.
93100
request = ListRegionsRequest(project=project_id)
94101
regions = client.list(request=request)
95-
96-
# Extract and return region names as list.
97102
return [region.name for region in regions]
98103
except Exception as e:
99-
logging.error("Failed to look up regions for project: %s", e)
104+
logging.error("Failed to look up regions for project '%s': %s", project_id, e)
100105
raise
101106

102107

103-
def _get_cloud_build_status_for_region(
104-
*, project_id: str, image_name: str, tags: list[str], region: str = "global"
105-
) -> Optional[CloudBuildStatus]:
106-
"""Gets the status of the latest build by filtering on the build tags, image name, and region.
108+
def _list_builds_in_region(
109+
project_id: str, image_name: str, tags: tuple[str, ...], region: str
110+
) -> list[Build]:
111+
"""Lists all builds for a given combination of region, project, image name, and tags.
107112
108113
Args:
109114
project_id: The GCP project ID.
110-
region: The GCP region. Defaults to 'global' if no region is given.
111-
image_name: The image name including the image path of the Artifact Registry.
112-
tags: A list of the CloudBuild build tags. Note that these are not docker image tags.
115+
image_name: The name of the Docker image.
116+
tags: A tuple of build tags to filter the builds.
117+
region: The region to query for builds.
113118
114119
Returns:
115-
CloudBuild status for the latest build in this region.
116-
None if no build found for the image name in the given region.
120+
A list of CloudBuild Builds matching the criteria.
121+
"""
122+
client = cloudbuild_v1.CloudBuildClient()
123+
request = cloudbuild_v1.ListBuildsRequest(
124+
parent=f"projects/{project_id}/locations/{region}",
125+
project_id=project_id,
126+
filter=_get_build_request_filter(image_name=image_name, tags=list(tags)),
127+
)
128+
return list(client.list_builds(request=request))
117129

118-
Raises:
119-
Exception: On failure to get the latest build status of a given image in a GCP project.
130+
131+
def _get_latest_build_status_in_region(
132+
project_id: str, image_name: str, tags: tuple[str, ...], region: str
133+
) -> Optional[CloudBuildStatus]:
134+
"""Gets the CloudBuild status for the latest build in a given region (no caching).
135+
136+
Args:
137+
project_id: The GCP project ID.
138+
image_name: The name of the Docker image.
139+
tags: A tuple of build tags to filter the builds.
140+
region: The region to query for the latest build.
141+
142+
Returns:
143+
The CloudBuildStatus of the latest build, or None if no build is found.
120144
"""
121145
try:
122-
client = cloudbuild_v1.CloudBuildClient()
123-
request = cloudbuild_v1.ListBuildsRequest(
124-
# CloudBuild lookups are region-specific.
125-
parent=f"projects/{project_id}/locations/{region}",
126-
project_id=project_id,
127-
filter=_get_build_request_filter(image_name=image_name, tags=tags),
146+
builds = _list_builds_in_region(
147+
project_id=project_id, image_name=image_name, tags=tags, region=region
128148
)
129-
builds = list(client.list_builds(request=request))
130-
131149
if not builds:
132-
logging.warning("No builds found in region '%s' for image '%s'", image_name, region)
150+
logging.info("No builds found in region '%s' for image '%s'.", region, image_name)
133151
return None
134152

153+
# Sort builds by creation time and pick the latest.
135154
builds.sort(key=lambda build: build.create_time)
136-
logging.info("Build found in region '%s' for image '%s': %s", region, image_name, builds)
137-
138155
latest_build = builds[-1]
156+
logging.info(
157+
"Latest build found in region '%s' for image '%s': %s", region, image_name, latest_build
158+
)
139159
return CloudBuildStatus.from_build_status(latest_build.status)
140160

141-
# TODO(liang-he): Distinguish retryable and non-retryable google.api_core.exceptions
142161
except Exception as e:
143162
logging.warning(
144163
"Failed to find the build for image '%s' in region '%s', exception: %s",
@@ -149,33 +168,53 @@ def _get_cloud_build_status_for_region(
149168
raise
150169

151170

171+
# In-memory memo to store the last known region for a given (project_id, image_name, tags).
172+
_last_known_region_for_build = {}
173+
174+
152175
def get_cloud_build_status(
153176
*, project_id: str, image_name: str, tags: list[str]
154177
) -> Optional[CloudBuildStatus]:
155178
"""Gets the status of the latest CloudBuild by filtering on the build tags and image name.
156179
157-
Performs a request for each available region, including 'global' first.
180+
In order:
181+
1. Queries the last known region where a build was previously found (if any).
182+
2. Queries all regions, starting with 'global', if not found above.
183+
184+
Only the region is remembered between calls; build results themselves are never cached, so the latest build status is always retrieved.
158185
159186
Args:
160187
project_id: The GCP project ID.
161-
image_name: The image name including the image path of the Artifact Registry.
162-
tags: A list of the CloudBuild build tags. Note that these are not docker image tags.
188+
image_name: The image name including the image path of the Artifact Registry.
189+
tags: A list of the CloudBuild build tags used to filter the builds. Note that these are not docker image tags.
163190
164191
Returns:
165-
CloudBuild status for the latest build found in the first available region.
166-
None if no build found for the image name and tag across all available regions.
192+
The CloudBuildStatus of the latest build, or None if no build is found.
167193
"""
168-
build_status = None
169-
# Unfortunately the CloudBuild API does not support wildcard region lookup.
170-
# Workaround: Check each region for the latest build, stopping when the first is found.
171-
# Try global (default) region first before other regions.
194+
tags_tuple = tuple(sorted(tags))
195+
196+
# If there is a last known region where a build was found previously, use it
197+
last_region = _last_known_region_for_build.get((project_id, image_name, tags_tuple))
198+
if last_region:
199+
logging.info("Checking last known region '%s' for image '%s'.", last_region, image_name)
200+
status = _get_latest_build_status_in_region(
201+
project_id=project_id, image_name=image_name, tags=tags_tuple, region=last_region
202+
)
203+
if status is not None:
204+
return status
205+
206+
# If not found yet, iterate over all available regions.
172207
all_regions = ["global"] + _list_available_regions(project_id)
173208
for region in all_regions:
174-
logging.info("Looking for CloudBuild with image '%s' in region '%s'", image_name, region)
175-
build_status = _get_cloud_build_status_for_region(
176-
project_id=project_id, image_name=image_name, tags=tags, region=region
209+
logging.info(
210+
"Checking region '%s' for image '%s' in project '%s'.", region, image_name, project_id
211+
)
212+
status = _get_latest_build_status_in_region(
213+
project_id=project_id, image_name=image_name, tags=tags_tuple, region=region
177214
)
178-
if build_status is not None:
179-
# Short-circuit so there are no extraneous queries after the first build is found.
180-
break
181-
return build_status
215+
if status is not None:
216+
_last_known_region_for_build[(project_id, image_name, tags_tuple)] = region
217+
return status
218+
219+
# No build found in any region.
220+
return None

0 commit comments

Comments
 (0)