Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ jobs:
uv pip install torch==2.9.0+cpu -f https://download.pytorch.org/whl/cpu/torch --system
uv pip install -e ".[all-test]" --system

- name: Run Pylint on IaC Code
run: pylint opentofu

- name: Run Pylint on Client Code
run: pylint src/client

Expand Down
52 changes: 36 additions & 16 deletions opentofu/cfgmgt/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,11 @@ def helm_repo_add_if_missing():

def apply_helm_chart_inner(release_name, namespace):
"""Apply Helm Chart"""
values_path = os.path.join(STAGE_PATH, "helm-values.yaml")
values_path = os.path.join(STAGE_PATH, "optimizer-helm-values.yaml")
if not os.path.isfile(values_path):
print(f"⚠️ Values file not found: {values_path}")
return False
print("ℹ️ Skipping Helm chart application.\n")
return True # Return True to indicate this is not a retriable failure

helm_repo_add_if_missing()

Expand All @@ -119,41 +120,60 @@ def apply_helm_chart_inner(release_name, namespace):
print("✅ Helm chart applied:")
print(f"Apply Helm Chart: {stdout}")
return True
else:
print(f"❌ Failed to apply Helm chart:\n{stderr}")
return False

print(f"❌ Failed to apply Helm chart:\n{stderr}")
return False


def apply_helm_chart(release_name, namespace):
    """Apply the Helm chart for release_name in namespace through retry()."""

    def _attempt():
        # One application attempt; its result is handed back to retry().
        return apply_helm_chart_inner(release_name, namespace)

    retry(_attempt)


# Applies STAGE_PATH/k8s-manifest.yaml with kubectl after removing any existing
# "optimizer-buildkit" Job in the given namespace.
# Returns False when the manifest file is missing or "kubectl apply" exits non-zero,
# True when it exits with 0.
# NOTE(review): two signatures and two copies of the failure branch appear in this span;
# presumably they are the removed and added sides of a diff whose +/- markers were
# lost — confirm against the repository.
def apply_manifest_inner():
def apply_manifest_inner(namespace):
"""Apply Manifest"""
manifest_path = os.path.join(STAGE_PATH, "k8s-manifest.yaml")
if not os.path.isfile(manifest_path):
print(f"⚠️ Manifest not found: {manifest_path}")
return False

# Delete existing Jobs with the same name to allow recreation
# Jobs are immutable and cannot be updated, only replaced
print("🗑️ Checking for existing buildkit Job...")
# "-o name" prints only the resource name, so non-empty stdout is used below as
# the signal that the Job exists.
stdout, _, _ = run_cmd(
["kubectl", "get", "job", "optimizer-buildkit", "-n", namespace, "-o", "name"], capture_output=True
)
if stdout:
print(f"🗑️ Deleting existing optimizer-buildkit Job in namespace '{namespace}'...")
# The return code of the delete is not inspected; "--ignore-not-found=true"
# covers the Job disappearing between the get and the delete.
run_cmd(
["kubectl", "delete", "job", "optimizer-buildkit", "-n", namespace, "--ignore-not-found=true"],
capture_output=False,
)
# NOTE(review): fixed 2 second pause; nothing here polls for the Job to be gone.
# kubectl delete may already wait for deletion by default — confirm whether
# this sleep is needed or sufficient.
time.sleep(2) # Wait for deletion to complete

print("🚀 Applying Kubernetes manifest: k8s-manifest.yaml")
# NOTE(review): stderr is interpolated into the failure message although
# capture_output=False is passed — presumably run_cmd does not capture stderr in
# that mode; verify against run_cmd.
_, stderr, rc = run_cmd(["kubectl", "apply", "-f", manifest_path], capture_output=False)
if rc == 0:
print("✅ Manifest applied.\n")
return True
else:
print(f"❌ Failed to apply manifest:\n{stderr}")
return False

print(f"❌ Failed to apply manifest:\n{stderr}")
return False


# Runs apply_manifest_inner through retry(); whatever retry() returns is not propagated.
# NOTE(review): both signatures and both retry(...) calls appear below; presumably the
# zero-argument pair is the removed side of a diff and the namespace pair the added
# side — confirm against the repository.
def apply_manifest():
def apply_manifest(namespace):
"""Retry Enabled Add/Update Manifest"""
retry(apply_manifest_inner)
# The lambda binds namespace into the callable handed to retry().
retry(lambda: apply_manifest_inner(namespace))


def patch_oracle_operator_inner():
"""Patch Oracle Database Operator deployment to disable readOnlyRootFilesystem"""
print("🔧 Patching oracle-database-operator deployment...")
patch_json = (
'[{"op": "replace", "path": '
'"/spec/template/spec/containers/0/securityContext/readOnlyRootFilesystem", '
'"value": false}]'
)
cmd = [
"kubectl",
"-n",
Expand All @@ -164,15 +184,15 @@ def patch_oracle_operator_inner():
"--type",
"json",
"-p",
'[{"op": "replace", "path": "/spec/template/spec/containers/0/securityContext/readOnlyRootFilesystem", "value": false}]',
patch_json,
]
_, stderr, rc = run_cmd(cmd, capture_output=False)
if rc == 0:
print("✅ Oracle operator patched.\n")
return True
else:
print(f"❌ Failed to patch operator:\n{stderr}")
return False

print(f"❌ Failed to patch operator:\n{stderr}")
return False


def patch_oracle_operator():
Expand All @@ -189,6 +209,6 @@ def patch_oracle_operator():
args = parser.parse_args()

mod_kubeconfig(args.private_endpoint)
apply_manifest()
apply_manifest(args.namespace)
patch_oracle_operator()
apply_helm_chart(args.release_name, args.namespace)
79 changes: 41 additions & 38 deletions opentofu/modules/kubernetes/cfgmgt.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,60 +3,63 @@
# spell-checker: disable

# Renders the Helm values and Kubernetes manifest templates into local values.
# NOTE(review): helm_values and k8s_manifest each appear twice in this span with
# different argument names; presumably these are the removed and added sides of a
# diff whose +/- markers were lost — confirm against the repository.
locals {
helm_values = templatefile("${path.module}/templates/helm_values.yaml", {
label = var.label_prefix
repository_server = local.repository_server
repository_client = local.repository_client
oci_tenancy = var.tenancy_id
oci_region = var.region
db_type = var.db_conn.db_type
db_ocid = var.db_ocid
db_dsn = var.db_conn.service
db_name = lower(var.db_name)
node_pool_gpu_deploy = var.node_pool_gpu_deploy
lb_ip = var.lb.ip_address_details[0].ip_address
k8s_manifest = templatefile("${path.module}/templates/k8s_manifest.yaml", {
label = var.label_prefix
repository_host = local.repository_host
optimizer_repository_server = local.optimizer_repository_server
optimizer_repository_client = local.optimizer_repository_client
compartment_ocid = var.lb.compartment_id
lb_ocid = var.lb.id
lb_subnet_ocid = var.public_subnet_id
# NOTE(review): lb_ip_ocid is assigned an ip_address value, not an OCID — confirm
# the template expects the address here.
lb_ip_ocid = var.lb.ip_address_details[0].ip_address
lb_nsgs = var.lb_nsg_id
lb_min_shape = var.lb.shape_details[0].minimum_bandwidth_in_mbps
lb_max_shape = var.lb.shape_details[0].maximum_bandwidth_in_mbps
db_name = lower(var.db_name)
db_username = var.db_conn.username
db_password = var.db_conn.password
db_service = var.db_conn.service
optimizer_api_key = random_string.optimizer_api_key.result
# True only when no bring-your-own OCIR URL was supplied.
deploy_buildkit = var.byo_ocir_url == ""
deploy_optimizer = var.deploy_optimizer
optimizer_version = var.optimizer_version
})

k8s_manifest = templatefile("${path.module}/templates/k8s_manifest.yaml", {
label = var.label_prefix
repository_host = local.repository_host
repository_server = local.repository_server
repository_client = local.repository_client
compartment_ocid = var.lb.compartment_id
lb_ocid = var.lb.id
lb_subnet_ocid = var.public_subnet_id
lb_ip_ocid = var.lb.ip_address_details[0].ip_address
lb_nsgs = var.lb_nsg_id
lb_min_shape = var.lb.shape_details[0].minimum_bandwidth_in_mbps
lb_max_shape = var.lb.shape_details[0].maximum_bandwidth_in_mbps
db_name = lower(var.db_name)
db_username = var.db_conn.username
db_password = var.db_conn.password
db_service = var.db_conn.service
api_key = random_string.api_key.result
deploy_buildkit = var.byo_ocir_url == ""
optimizer_version = var.optimizer_version
helm_values = templatefile("${path.module}/templates/optimizer_helm_values.yaml", {
label = var.label_prefix
optimizer_repository_server = local.optimizer_repository_server
optimizer_repository_client = local.optimizer_repository_client
oci_tenancy = var.tenancy_id
oci_region = var.region
db_type = var.db_conn.db_type
db_ocid = var.db_ocid
db_dsn = var.db_conn.service
db_name = lower(var.db_name)
node_pool_gpu_deploy = var.node_pool_gpu_deploy
lb_ip = var.lb.ip_address_details[0].ip_address
})
}


# Writes the cluster kubeconfig content from the OKE kube_config data source to
# cfgmgt/stage/kubeconfig under the root module, with file permission 0600.
resource "local_sensitive_file" "kubeconfig" {
content = data.oci_containerengine_cluster_kube_config.default_cluster_kube_config.content
filename = "${path.root}/cfgmgt/stage/kubeconfig"
file_permission = 0600
}

# Writes the rendered local.helm_values to cfgmgt/stage/helm-values.yaml under the
# root module, with file permission 0600.
# NOTE(review): another resource in this file writes the same local.helm_values to
# optimizer-helm-values.yaml; presumably this block is the pre-change side of the
# diff — confirm against the repository.
resource "local_sensitive_file" "helm_values" {
content = local.helm_values
filename = "${path.root}/cfgmgt/stage/helm-values.yaml"
file_permission = 0600
}

# Writes the rendered local.k8s_manifest to cfgmgt/stage/k8s-manifest.yaml under the
# root module, with file permission 0600.
resource "local_sensitive_file" "k8s_manifest" {
content = local.k8s_manifest
filename = "${path.root}/cfgmgt/stage/k8s-manifest.yaml"
file_permission = 0600
}

# Writes the rendered local.helm_values to cfgmgt/stage/optimizer-helm-values.yaml
# under the root module, with file permission 0600.
resource "local_sensitive_file" "optimizer_helm_values" {
# Created only when var.deploy_optimizer is true; otherwise no file is written.
count = var.deploy_optimizer ? 1 : 0
content = local.helm_values
filename = "${path.root}/cfgmgt/stage/optimizer-helm-values.yaml"
file_permission = 0600
}

resource "null_resource" "apply" {
count = var.run_cfgmgt ? 1 : 0
triggers = {
Expand All @@ -77,8 +80,8 @@ resource "null_resource" "apply" {
}
depends_on = [
local_sensitive_file.kubeconfig,
local_sensitive_file.helm_values,
local_sensitive_file.k8s_manifest,
local_sensitive_file.optimizer_helm_values,
oci_containerengine_node_pool.cpu_node_pool_details,
oci_containerengine_node_pool.gpu_node_pool_details,
oci_containerengine_addon.oraoper_addon,
Expand Down
14 changes: 7 additions & 7 deletions opentofu/modules/kubernetes/locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,16 @@ locals {

// Repositories
# Container repository names and the registry paths derived from them.
# NOTE(review): several names appear twice in this span (for example region_map and
# repository_base); presumably these are the removed and added sides of a diff whose
# +/- markers were lost — confirm against the repository.
locals {
container_repositories = [
optimizer_container_repositories = [
"ai-optimizer-server",
"ai-optimizer-client"
]
# Map of region name => region key, built from the identity regions data source.
region_map = { for r in data.oci_identity_regions.identity_regions.regions : r.name => r.key }
# Region key for var.region; used as the prefix of the ocir.io hostname.
image_region = lookup(local.region_map, var.region)
repository_host = lower(format("%s.ocir.io", local.image_region))
# var.byo_ocir_url wins when non-empty; otherwise host/namespace is used.
repository_base = var.byo_ocir_url != "" ? var.byo_ocir_url : lower(format("%s/%s", local.repository_host, data.oci_objectstorage_namespace.objectstorage_namespace.namespace))
repository_server = lower(format("%s/ai-optimizer-server", local.repository_base))
repository_client = lower(format("%s/ai-optimizer-client", local.repository_base))
region_map = { for r in data.oci_identity_regions.identity_regions.regions : r.name => r.key }
image_region = lookup(local.region_map, var.region)
repository_host = lower(format("%s.ocir.io", local.image_region))
# This variant appends var.label_prefix as a third path segment when building the default base.
repository_base = var.byo_ocir_url != "" ? var.byo_ocir_url : lower(format("%s/%s/%s", local.repository_host, data.oci_objectstorage_namespace.objectstorage_namespace.namespace, var.label_prefix))
optimizer_repository_server = lower(format("%s/ai-optimizer-server", local.repository_base))
optimizer_repository_client = lower(format("%s/ai-optimizer-client", local.repository_base))
}

// Cluster Details
Expand Down
4 changes: 2 additions & 2 deletions opentofu/modules/kubernetes/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# All rights reserved. The Universal Permissive License (UPL), Version 1.0 as shown at http://oss.oracle.com/licenses/upl
# spell-checker: disable

resource "random_string" "api_key" {
resource "random_string" "optimizer_api_key" {
length = 32
special = true
upper = true
Expand All @@ -14,7 +14,7 @@ resource "random_string" "api_key" {
// oci_artifacts_container_repository
// OCIR
resource "oci_artifacts_container_repository" "optimizer_repositories" {
for_each = var.byo_ocir_url != "" ? toset([]) : toset(local.container_repositories)
for_each = var.byo_ocir_url != "" ? toset([]) : toset(local.optimizer_container_repositories)
compartment_id = var.compartment_id
display_name = lower(format("%s/%s", var.label_prefix, each.value))
is_immutable = false
Expand Down
26 changes: 13 additions & 13 deletions opentofu/modules/kubernetes/templates/k8s_manifest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ metadata:
apiVersion: v1
kind: Secret
metadata:
name: ${label}-api-key
name: ${label}-optimizer-api-key
namespace: ${label}
type: Opaque
stringData:
apiKey: ${api_key}
apiKey: ${optimizer_api_key}
---
# Secret containing non-privileged DB User details.
# Used for application connectivity to database.
Expand Down Expand Up @@ -75,14 +75,14 @@ spec:
apiGroup: ingress.oraclecloud.com
kind: IngressClassParameters
name: native-ic-params
%{ if deploy_buildkit ~}
%{ if deploy_buildkit && deploy_optimizer ~}
---
# Builds and pushes application images to container registry.
# Uses instance principals.
apiVersion: batch/v1
kind: Job
metadata:
name: buildkit
name: optimizer-buildkit
namespace: ${label}
spec:
ttlSecondsAfterFinished: 300
Expand All @@ -97,11 +97,11 @@ spec:
- -c
- |
if [ "${optimizer_version}" = "Experimental" ]; then
echo "Downloading Code from MAIN branch"
wget -qO- https://github.com/oracle/ai-optimizer/archive/refs/heads/main.tar.gz \
| tar -xz -C /workspace ai-optimizer-main/src ai-optimizer-main/pyproject.toml --strip-components=1
mv /workspace/src/* /workspace/src/.* /workspace/ 2>/dev/null || true
rm -rf /workspace/src
| tar -xz -C /workspace --strip-components=1 ai-optimizer-main/src ai-optimizer-main/pyproject.toml
else
echo "Downloading Code from LATEST release"
wget -qO- https://github.com/oracle/ai-optimizer/releases/latest/download/ai-optimizer-src.tar.gz \
| tar -xz -C /workspace
fi
Expand All @@ -118,7 +118,7 @@ spec:
- -c
- |
RETRY_COUNT=0
REPO_PATH=$(echo "${repository_client}" | cut -d'/' -f2-)
REPO_PATH=$(echo "${optimizer_repository_client}" | cut -d'/' -f2-)

while [ $RETRY_COUNT -lt 10 ]; do
RETRY_COUNT=$((RETRY_COUNT + 1))
Expand All @@ -130,7 +130,7 @@ spec:
chown 1000:1000 /docker-config/config.json

HTTP_STATUS=$(oci raw-request --http-method GET \
--target-uri "https://${repository_host}/v2/$REPO_PATH/tags/list" \
--target-uri "https://${repository_host}/v2/$REPO_PATH/manifests/latest" \
--request-headers "{\"Authorization\": \"Bearer $TOKEN\"}" 2>/dev/null | jq -r '.status' || echo "000")

HTTP_CODE=$(echo "$HTTP_STATUS" | cut -d' ' -f1)
Expand Down Expand Up @@ -168,8 +168,8 @@ spec:
--progress plain \
--frontend dockerfile.v0 \
--local context=/workspace \
--local dockerfile=/workspace/client \
--output type=image,name=${repository_client}:latest,push=true
--local dockerfile=/workspace/src/client \
--output type=image,name=${optimizer_repository_client}:latest,push=true
securityContext:
seccompProfile:
type: Unconfined
Expand Down Expand Up @@ -201,8 +201,8 @@ spec:
--progress plain \
--frontend dockerfile.v0 \
--local context=/workspace \
--local dockerfile=/workspace/server \
--output type=image,name=${repository_server}:latest,push=true
--local dockerfile=/workspace/src/server \
--output type=image,name=${optimizer_repository_server}:latest,push=true
securityContext:
seccompProfile:
type: Unconfined
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@

global:
api:
secretName: "${label}-api-key"
secretName: "${label}-optimizer-api-key"

# -- API Server configuration
server:
image:
repository: ${repository_server}
repository: ${optimizer_repository_server}
tag: "latest"
pullPolicy: Always

Expand Down Expand Up @@ -48,7 +48,7 @@ server:
client:
enable: true
image:
repository: ${repository_client}
repository: ${optimizer_repository_client}
tag: "latest"
pullPolicy: Always

Expand Down
5 changes: 5 additions & 0 deletions opentofu/modules/kubernetes/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -132,4 +132,9 @@ variable "byo_ocir_url" {

# Passed to the k8s_manifest template, whose download script compares it to
# "Experimental" to choose between the main-branch archive and the latest release
# archive. No default is declared here.
variable "optimizer_version" {
type = string
}

# Toggles the optimizer deployment: gates the count of the optimizer_helm_values file
# resource and is passed to the k8s_manifest template, where the buildkit Job section
# is rendered only when both deploy_buildkit and deploy_optimizer are true.
variable "deploy_optimizer" {
type = bool
default = true
}
Loading
Loading