Skip to content

Commit dd9a1e5

Browse files
authored
K8s IaC with OCI GenAI Instance Principals (#314)
* K8s IaC with OCI GenAI Instance Principals
1 parent 4670b21 commit dd9a1e5

File tree

13 files changed

+194
-98
lines changed

13 files changed

+194
-98
lines changed

.github/workflows/pytest.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ jobs:
4444
uv pip install torch==2.9.0+cpu -f https://download.pytorch.org/whl/cpu/torch --system
4545
uv pip install -e ".[all-test]" --system
4646
47+
- name: Run Pylint on IaC Code
48+
run: pylint opentofu
49+
4750
- name: Run Pylint on Client Code
4851
run: pylint src/client
4952

opentofu/cfgmgt/apply.py

Lines changed: 36 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -94,10 +94,11 @@ def helm_repo_add_if_missing():
9494

9595
def apply_helm_chart_inner(release_name, namespace):
9696
"""Apply Helm Chart"""
97-
values_path = os.path.join(STAGE_PATH, "helm-values.yaml")
97+
values_path = os.path.join(STAGE_PATH, "optimizer-helm-values.yaml")
9898
if not os.path.isfile(values_path):
9999
print(f"⚠️ Values file not found: {values_path}")
100-
return False
100+
print("ℹ️ Skipping Helm chart application.\n")
101+
return True # Return True to indicate this is not a retriable failure
101102

102103
helm_repo_add_if_missing()
103104

@@ -119,41 +120,60 @@ def apply_helm_chart_inner(release_name, namespace):
119120
print("✅ Helm chart applied:")
120121
print(f"Apply Helm Chart: {stdout}")
121122
return True
122-
else:
123-
print(f"❌ Failed to apply Helm chart:\n{stderr}")
124-
return False
123+
124+
print(f"❌ Failed to apply Helm chart:\n{stderr}")
125+
return False
125126

126127

127128
def apply_helm_chart(release_name, namespace):
128129
"""Retry Enabled Add/Update Helm Chart"""
129130
retry(lambda: apply_helm_chart_inner(release_name, namespace))
130131

131132

132-
def apply_manifest_inner():
133+
def apply_manifest_inner(namespace):
133134
"""Apply Manifest"""
134135
manifest_path = os.path.join(STAGE_PATH, "k8s-manifest.yaml")
135136
if not os.path.isfile(manifest_path):
136137
print(f"⚠️ Manifest not found: {manifest_path}")
137138
return False
138139

140+
# Delete existing Jobs with the same name to allow recreation
141+
# Jobs are immutable and cannot be updated, only replaced
142+
print("🗑️ Checking for existing buildkit Job...")
143+
stdout, _, _ = run_cmd(
144+
["kubectl", "get", "job", "optimizer-buildkit", "-n", namespace, "-o", "name"], capture_output=True
145+
)
146+
if stdout:
147+
print(f"🗑️ Deleting existing optimizer-buildkit Job in namespace '{namespace}'...")
148+
run_cmd(
149+
["kubectl", "delete", "job", "optimizer-buildkit", "-n", namespace, "--ignore-not-found=true"],
150+
capture_output=False,
151+
)
152+
time.sleep(2) # Wait for deletion to complete
153+
139154
print("🚀 Applying Kubernetes manifest: k8s-manifest.yaml")
140155
_, stderr, rc = run_cmd(["kubectl", "apply", "-f", manifest_path], capture_output=False)
141156
if rc == 0:
142157
print("✅ Manifest applied.\n")
143158
return True
144-
else:
145-
print(f"❌ Failed to apply manifest:\n{stderr}")
146-
return False
159+
160+
print(f"❌ Failed to apply manifest:\n{stderr}")
161+
return False
147162

148163

149-
def apply_manifest():
164+
def apply_manifest(namespace):
150165
"""Retry Enabled Add/Update Manifest"""
151-
retry(apply_manifest_inner)
166+
retry(lambda: apply_manifest_inner(namespace))
152167

153168

154169
def patch_oracle_operator_inner():
155170
"""Patch Oracle Database Operator deployment to disable readOnlyRootFilesystem"""
156171
print("🔧 Patching oracle-database-operator deployment...")
172+
patch_json = (
173+
'[{"op": "replace", "path": '
174+
'"/spec/template/spec/containers/0/securityContext/readOnlyRootFilesystem", '
175+
'"value": false}]'
176+
)
157177
cmd = [
158178
"kubectl",
159179
"-n",
@@ -164,15 +184,15 @@ def patch_oracle_operator_inner():
164184
"--type",
165185
"json",
166186
"-p",
167-
'[{"op": "replace", "path": "/spec/template/spec/containers/0/securityContext/readOnlyRootFilesystem", "value": false}]',
187+
patch_json,
168188
]
169189
_, stderr, rc = run_cmd(cmd, capture_output=False)
170190
if rc == 0:
171191
print("✅ Oracle operator patched.\n")
172192
return True
173-
else:
174-
print(f"❌ Failed to patch operator:\n{stderr}")
175-
return False
193+
194+
print(f"❌ Failed to patch operator:\n{stderr}")
195+
return False
176196

177197

178198
def patch_oracle_operator():
@@ -189,6 +209,6 @@ def patch_oracle_operator():
189209
args = parser.parse_args()
190210

191211
mod_kubeconfig(args.private_endpoint)
192-
apply_manifest()
212+
apply_manifest(args.namespace)
193213
patch_oracle_operator()
194214
apply_helm_chart(args.release_name, args.namespace)

opentofu/modules/kubernetes/cfgmgt.tf

Lines changed: 41 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -3,60 +3,63 @@
33
# spell-checker: disable
44

55
locals {
6-
helm_values = templatefile("${path.module}/templates/helm_values.yaml", {
7-
label = var.label_prefix
8-
repository_server = local.repository_server
9-
repository_client = local.repository_client
10-
oci_tenancy = var.tenancy_id
11-
oci_region = var.region
12-
db_type = var.db_conn.db_type
13-
db_ocid = var.db_ocid
14-
db_dsn = var.db_conn.service
15-
db_name = lower(var.db_name)
16-
node_pool_gpu_deploy = var.node_pool_gpu_deploy
17-
lb_ip = var.lb.ip_address_details[0].ip_address
6+
k8s_manifest = templatefile("${path.module}/templates/k8s_manifest.yaml", {
7+
label = var.label_prefix
8+
repository_host = local.repository_host
9+
optimizer_repository_server = local.optimizer_repository_server
10+
optimizer_repository_client = local.optimizer_repository_client
11+
compartment_ocid = var.lb.compartment_id
12+
lb_ocid = var.lb.id
13+
lb_subnet_ocid = var.public_subnet_id
14+
lb_ip_ocid = var.lb.ip_address_details[0].ip_address
15+
lb_nsgs = var.lb_nsg_id
16+
lb_min_shape = var.lb.shape_details[0].minimum_bandwidth_in_mbps
17+
lb_max_shape = var.lb.shape_details[0].maximum_bandwidth_in_mbps
18+
db_name = lower(var.db_name)
19+
db_username = var.db_conn.username
20+
db_password = var.db_conn.password
21+
db_service = var.db_conn.service
22+
optimizer_api_key = random_string.optimizer_api_key.result
23+
deploy_buildkit = var.byo_ocir_url == ""
24+
deploy_optimizer = var.deploy_optimizer
25+
optimizer_version = var.optimizer_version
1826
})
1927

20-
k8s_manifest = templatefile("${path.module}/templates/k8s_manifest.yaml", {
21-
label = var.label_prefix
22-
repository_host = local.repository_host
23-
repository_server = local.repository_server
24-
repository_client = local.repository_client
25-
compartment_ocid = var.lb.compartment_id
26-
lb_ocid = var.lb.id
27-
lb_subnet_ocid = var.public_subnet_id
28-
lb_ip_ocid = var.lb.ip_address_details[0].ip_address
29-
lb_nsgs = var.lb_nsg_id
30-
lb_min_shape = var.lb.shape_details[0].minimum_bandwidth_in_mbps
31-
lb_max_shape = var.lb.shape_details[0].maximum_bandwidth_in_mbps
32-
db_name = lower(var.db_name)
33-
db_username = var.db_conn.username
34-
db_password = var.db_conn.password
35-
db_service = var.db_conn.service
36-
api_key = random_string.api_key.result
37-
deploy_buildkit = var.byo_ocir_url == ""
38-
optimizer_version = var.optimizer_version
28+
helm_values = templatefile("${path.module}/templates/optimizer_helm_values.yaml", {
29+
label = var.label_prefix
30+
optimizer_repository_server = local.optimizer_repository_server
31+
optimizer_repository_client = local.optimizer_repository_client
32+
oci_tenancy = var.tenancy_id
33+
oci_region = var.region
34+
db_type = var.db_conn.db_type
35+
db_ocid = var.db_ocid
36+
db_dsn = var.db_conn.service
37+
db_name = lower(var.db_name)
38+
node_pool_gpu_deploy = var.node_pool_gpu_deploy
39+
lb_ip = var.lb.ip_address_details[0].ip_address
3940
})
4041
}
4142

43+
4244
resource "local_sensitive_file" "kubeconfig" {
4345
content = data.oci_containerengine_cluster_kube_config.default_cluster_kube_config.content
4446
filename = "${path.root}/cfgmgt/stage/kubeconfig"
4547
file_permission = 0600
4648
}
4749

48-
resource "local_sensitive_file" "helm_values" {
49-
content = local.helm_values
50-
filename = "${path.root}/cfgmgt/stage/helm-values.yaml"
51-
file_permission = 0600
52-
}
53-
5450
resource "local_sensitive_file" "k8s_manifest" {
5551
content = local.k8s_manifest
5652
filename = "${path.root}/cfgmgt/stage/k8s-manifest.yaml"
5753
file_permission = 0600
5854
}
5955

56+
resource "local_sensitive_file" "optimizer_helm_values" {
57+
count = var.deploy_optimizer ? 1 : 0
58+
content = local.helm_values
59+
filename = "${path.root}/cfgmgt/stage/optimizer-helm-values.yaml"
60+
file_permission = 0600
61+
}
62+
6063
resource "null_resource" "apply" {
6164
count = var.run_cfgmgt ? 1 : 0
6265
triggers = {
@@ -77,8 +80,8 @@ resource "null_resource" "apply" {
7780
}
7881
depends_on = [
7982
local_sensitive_file.kubeconfig,
80-
local_sensitive_file.helm_values,
8183
local_sensitive_file.k8s_manifest,
84+
local_sensitive_file.optimizer_helm_values,
8285
oci_containerengine_node_pool.cpu_node_pool_details,
8386
oci_containerengine_node_pool.gpu_node_pool_details,
8487
oci_containerengine_addon.oraoper_addon,

opentofu/modules/kubernetes/locals.tf

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -54,16 +54,16 @@ locals {
5454

5555
// Repositories
5656
locals {
57-
container_repositories = [
57+
optimizer_container_repositories = [
5858
"ai-optimizer-server",
5959
"ai-optimizer-client"
6060
]
61-
region_map = { for r in data.oci_identity_regions.identity_regions.regions : r.name => r.key }
62-
image_region = lookup(local.region_map, var.region)
63-
repository_host = lower(format("%s.ocir.io", local.image_region))
64-
repository_base = var.byo_ocir_url != "" ? var.byo_ocir_url : lower(format("%s/%s", local.repository_host, data.oci_objectstorage_namespace.objectstorage_namespace.namespace))
65-
repository_server = lower(format("%s/ai-optimizer-server", local.repository_base))
66-
repository_client = lower(format("%s/ai-optimizer-client", local.repository_base))
61+
region_map = { for r in data.oci_identity_regions.identity_regions.regions : r.name => r.key }
62+
image_region = lookup(local.region_map, var.region)
63+
repository_host = lower(format("%s.ocir.io", local.image_region))
64+
repository_base = var.byo_ocir_url != "" ? var.byo_ocir_url : lower(format("%s/%s/%s", local.repository_host, data.oci_objectstorage_namespace.objectstorage_namespace.namespace, var.label_prefix))
65+
optimizer_repository_server = lower(format("%s/ai-optimizer-server", local.repository_base))
66+
optimizer_repository_client = lower(format("%s/ai-optimizer-client", local.repository_base))
6767
}
6868

6969
// Cluster Details

opentofu/modules/kubernetes/main.tf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# All rights reserved. The Universal Permissive License (UPL), Version 1.0 as shown at http://oss.oracle.com/licenses/upl
33
# spell-checker: disable
44

5-
resource "random_string" "api_key" {
5+
resource "random_string" "optimizer_api_key" {
66
length = 32
77
special = true
88
upper = true
@@ -14,7 +14,7 @@ resource "random_string" "api_key" {
1414
// oci_artifacts_container_repository
1515
// OCIR
1616
resource "oci_artifacts_container_repository" "optimizer_repositories" {
17-
for_each = var.byo_ocir_url != "" ? toset([]) : toset(local.container_repositories)
17+
for_each = var.byo_ocir_url != "" ? toset([]) : toset(local.optimizer_container_repositories)
1818
compartment_id = var.compartment_id
1919
display_name = lower(format("%s/%s", var.label_prefix, each.value))
2020
is_immutable = false

opentofu/modules/kubernetes/templates/k8s_manifest.yaml

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@ metadata:
1111
apiVersion: v1
1212
kind: Secret
1313
metadata:
14-
name: ${label}-api-key
14+
name: ${label}-optimizer-api-key
1515
namespace: ${label}
1616
type: Opaque
1717
stringData:
18-
apiKey: ${api_key}
18+
apiKey: ${optimizer_api_key}
1919
---
2020
# Secret containing non-privileged DB User details.
2121
# Used for application connectivity to database.
@@ -75,14 +75,14 @@ spec:
7575
apiGroup: ingress.oraclecloud.com
7676
kind: IngressClassParameters
7777
name: native-ic-params
78-
%{ if deploy_buildkit ~}
78+
%{ if deploy_buildkit && deploy_optimizer ~}
7979
---
8080
# Builds and pushes application images to container registry.
8181
# Uses instance principals.
8282
apiVersion: batch/v1
8383
kind: Job
8484
metadata:
85-
name: buildkit
85+
name: optimizer-buildkit
8686
namespace: ${label}
8787
spec:
8888
ttlSecondsAfterFinished: 300
@@ -97,11 +97,11 @@ spec:
9797
- -c
9898
- |
9999
if [ "${optimizer_version}" = "Experimental" ]; then
100+
echo "Downloading Code from MAIN branch"
100101
wget -qO- https://github.com/oracle/ai-optimizer/archive/refs/heads/main.tar.gz \
101-
| tar -xz -C /workspace ai-optimizer-main/src ai-optimizer-main/pyproject.toml --strip-components=1
102-
mv /workspace/src/* /workspace/src/.* /workspace/ 2>/dev/null || true
103-
rm -rf /workspace/src
102+
| tar -xz -C /workspace --strip-components=1 ai-optimizer-main/src ai-optimizer-main/pyproject.toml
104103
else
104+
echo "Downloading Code from LATEST release"
105105
wget -qO- https://github.com/oracle/ai-optimizer/releases/latest/download/ai-optimizer-src.tar.gz \
106106
| tar -xz -C /workspace
107107
fi
@@ -118,7 +118,7 @@ spec:
118118
- -c
119119
- |
120120
RETRY_COUNT=0
121-
REPO_PATH=$(echo "${repository_client}" | cut -d'/' -f2-)
121+
REPO_PATH=$(echo "${optimizer_repository_client}" | cut -d'/' -f2-)
122122
123123
while [ $RETRY_COUNT -lt 10 ]; do
124124
RETRY_COUNT=$((RETRY_COUNT + 1))
@@ -130,7 +130,7 @@ spec:
130130
chown 1000:1000 /docker-config/config.json
131131
132132
HTTP_STATUS=$(oci raw-request --http-method GET \
133-
--target-uri "https://${repository_host}/v2/$REPO_PATH/tags/list" \
133+
--target-uri "https://${repository_host}/v2/$REPO_PATH/manifests/latest" \
134134
--request-headers "{\"Authorization\": \"Bearer $TOKEN\"}" 2>/dev/null | jq -r '.status' || echo "000")
135135
136136
HTTP_CODE=$(echo "$HTTP_STATUS" | cut -d' ' -f1)
@@ -168,8 +168,8 @@ spec:
168168
--progress plain \
169169
--frontend dockerfile.v0 \
170170
--local context=/workspace \
171-
--local dockerfile=/workspace/client \
172-
--output type=image,name=${repository_client}:latest,push=true
171+
--local dockerfile=/workspace/src/client \
172+
--output type=image,name=${optimizer_repository_client}:latest,push=true
173173
securityContext:
174174
seccompProfile:
175175
type: Unconfined
@@ -201,8 +201,8 @@ spec:
201201
--progress plain \
202202
--frontend dockerfile.v0 \
203203
--local context=/workspace \
204-
--local dockerfile=/workspace/server \
205-
--output type=image,name=${repository_server}:latest,push=true
204+
--local dockerfile=/workspace/src/server \
205+
--output type=image,name=${optimizer_repository_server}:latest,push=true
206206
securityContext:
207207
seccompProfile:
208208
type: Unconfined

opentofu/modules/kubernetes/templates/helm_values.yaml renamed to opentofu/modules/kubernetes/templates/optimizer_helm_values.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@
44

55
global:
66
api:
7-
secretName: "${label}-api-key"
7+
secretName: "${label}-optimizer-api-key"
88

99
# -- API Server configuration
1010
server:
1111
image:
12-
repository: ${repository_server}
12+
repository: ${optimizer_repository_server}
1313
tag: "latest"
1414
pullPolicy: Always
1515

@@ -48,7 +48,7 @@ server:
4848
client:
4949
enable: true
5050
image:
51-
repository: ${repository_client}
51+
repository: ${optimizer_repository_client}
5252
tag: "latest"
5353
pullPolicy: Always
5454

opentofu/modules/kubernetes/variables.tf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,4 +132,9 @@ variable "byo_ocir_url" {
132132

133133
variable "optimizer_version" {
134134
type = string
135+
}
136+
137+
variable "deploy_optimizer" {
138+
type = bool
139+
default = true
135140
}

0 commit comments

Comments
 (0)