Merge pull request #4 from QGreenland-Net/run-parsl-on-adc-cluster

WIP set up process to run the Parsl script as a k8s Job
QGreenland-Net · Apr 15, 2024 · 53dd96a · 53dd96a
2 parents ded1760 + 5919362
commit 53dd96a
Show file tree

Hide file tree

Showing 6 changed files with 94 additions and 42 deletions.
diff --git a/.github/workflows/build-and-publish-container-image.yml b/.github/workflows/build-and-publish-container-image.yml
@@ -0,0 +1,47 @@
+name: "Build and publish container image"
+
+# Rebuild the image when its inputs change on `main`, or when a version tag
+# is pushed. NOTE: GitHub does not evaluate `paths` filters for tag pushes,
+# so every matching tag triggers a build.
+on:
+  push:
+    paths:
+      - "Dockerfile"
+      - "environment.yml"
+    branches:
+      - "main"
+    tags:
+      - "v[0-9]+.[0-9]+.[0-9]+*"
+
+jobs:
+
+  build-and-release-image:
+    name: "Build and release container image"
+    runs-on: "ubuntu-latest"
+    # The default GITHUB_TOKEN can be read-only (depending on repository or
+    # organization settings). Pushing to GHCR requires `packages: write`, so
+    # request exactly the scopes this job needs.
+    permissions:
+      contents: "read"
+      packages: "write"
+    env:
+      # NOTE: It's important that the image name matches org name / repo name.
+      # `github.repository` cannot be used as-is because it preserves the
+      # upper-case letters in the org name, and container image names must be
+      # lower-case.
+      # TODO: Calculate image name (lower-cased `github.repository`)?
+      IMAGE_NAME: "qgreenland-net/parsl-exploration"
+      # GitHub Actions expressions don't have great conditional support, so
+      # writing a ternary expression looks a lot like bash. In Python, this
+      # would read as:
+      #     github.ref_name if github.ref_type == 'tag' else 'latest'
+      #     https://docs.github.com/en/actions/learn-github-actions/expressions
+      IMAGE_TAG: "${{ github.ref_type == 'tag' && github.ref_name || 'latest' }}"
+    steps:
+      - name: "Check out repository"
+        uses: "actions/checkout@v4"
+
+      - name: "Build container image"
+        run: |
+          docker build --tag "ghcr.io/${IMAGE_NAME}:${IMAGE_TAG}" .
+
+      - name: "GHCR login"
+        uses: "docker/login-action@v3"
+        with:
+          registry: "ghcr.io"
+          username: "${{ github.repository_owner }}"
+          password: "${{ secrets.GITHUB_TOKEN }}"
+
+      - name: "Push to GHCR"
+        run: |
+          docker push "ghcr.io/${IMAGE_NAME}:${IMAGE_TAG}"
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,8 @@
+FROM mambaorg/micromamba:1.5.8 AS micromamba
+
+# Copy only the environment spec first, so the (slow) dependency layer stays
+# cached until environment.yml itself changes.
+COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml environment.yml
+
+# Install and clean in a single layer: cleaning in a separate RUN cannot
+# shrink the earlier layer that already contains the package caches.
+RUN micromamba install --yes --name "base" --file "environment.yml" \
+    && micromamba clean --all --yes
+
+# Copy the rest of the build context (same final contents as before).
+COPY --chown=$MAMBA_USER:$MAMBA_USER . .
+
+# ENV PATH "/opt/conda/bin:${PATH}"
diff --git a/README.md b/README.md
@@ -59,11 +59,8 @@ workers need to be able to connect back to the host running the Parsl program. I
 behind a firewall you don't control, this may not be possible!
 
 The workaround we're using is to submit a Kubernetes Job that runs the Parsl init
-program from a ConfigMap. See `hello-world-job.yml` for a generic example of this.
-
-Run it with `kubectl apply -f hello-world-job.yml`.
-
-We haven't yet got this working with Parsl, but that's the next step!
+program from a ConfigMap. See `run-on-remote-cluster.sh` and `job.yml` for an
+example of this.
 
 
 ## Troubleshooting

diff --git a/hello-world-job.yml → job.yml b/hello-world-job.yml → job.yml
@@ -6,6 +6,8 @@ spec:
   # TODO: when completions is 1, parallelism must be 1, but do we need to specify it?
   parallelism: 1
   completions: 1
+  # TODO: Supported in k8s 1.23, but ADC has 1.22; this would be nice :)
+  # ttlSecondsAfterFinished: 60
   template:
     metadata:
       name: "parsl-init"
@@ -14,38 +16,22 @@ spec:
       #     MountVolume.SetUp failed for volume "kube-api-access-xxxxx" : object "qgnet"/"kube-root-ca.crt" not registered
       automountServiceAccountToken: false
       volumes:
-        - name: "parsl-init-scripts-volume"
+        - name: "parsl-init-script-volume"
           configMap:
-            name: "parsl-init-scripts"
+            name: "parsl-init-script"
       containers:
         - name: "parsl-init"
           image: "python"
           volumeMounts:
-            - mountPath: "/parsl-init-scripts"
-              name: "parsl-init-scripts-volume"
+            - mountPath: "/parsl-init-script"
+              name: "parsl-init-script-volume"
           env:
             # TODO: Do we need this?
             - name: "HOME"
               value: "/tmp"
           command:
-            - "python"
-            - "/parsl-init-scripts/run.py"  # Filename from ConfigMap
+            - "bash"
+            - "-c"
+            # TODO: Bake parsl into an image
+            - "pip install parsl && python /parsl-init-script/run.py"  # << Filename from ConfigMap
       restartPolicy: "Never"
-
----
-
-apiVersion: "v1"
-# TODO: WHY? Just express ConfigMap alone at the top level?
-kind: "List"
-items:
-- apiVersion: "v1"
-  kind: "ConfigMap"
-  data:
-    run.py: |
-      print("I'm the Python script (run.py)!")
-      print("Hello world :|")
-  metadata:
-    creationTimestamp: null
-    name: "parsl-init-scripts"
-# TODO: WHY? Omit?
-metadata: {}
diff --git a/run-on-remote-cluster.sh b/run-on-remote-cluster.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Send our Parsl init script to the cluster. This will update the ConfigMap if
+# there are any changes. Note that "age" represents the time since the
+# ConfigMap was created, not since it was last updated.
+kubectl create configmap parsl-init-script --from-file run.py \
+    -o yaml --dry-run=client \
+    | kubectl apply -f -
+
+# Submit a "Job" to the cluster which runs our script
+# TODO: Should we delete any pre-existing job? We're manually doing `kubectl delete` now.
+kubectl apply -f job.yml
+
+
+# TODO: Can we also attach to monitor `kubectl describe job` or something?
diff --git a/run.py b/run.py
@@ -1,4 +1,9 @@
-"""Example parsl workflow to be executed on kubernetes."""
+"""Example parsl workflow to be executed on kubernetes.
+
+TODO:
+
+* Less printing, more logging
+"""
 
 import subprocess
 
@@ -28,16 +33,8 @@ def get_k8s_context() -> str:
 
     context = result.stdout.decode("utf8").strip()
 
-    assert context in ("rancher-desktop", "dev-qgnet")
-    if context == "dev-qgnet":
-        raise NotImplementedError(
-            "Running on the 'dev-qgnet' namespace fails due to container"
-            " communication issues. Symptom: This script hangs. Remove this check from"
-            " the code to re-test."
-        )
-
     print(f"Detected context: {context}")
-
+    assert context in ("rancher-desktop", "dev-qgnet")
     return context
 
 
@@ -62,13 +59,14 @@ def get_parsl_config():
                 cores_per_worker=1,
                 max_workers_per_node=1,
                 worker_logdir_root="/tmp/",
-                # Address for the pod worker to connect back
-                address=address_by_route(),
+                # Address for the pod worker to connect back to the "interchange"
+                address="8.44.147.13",
+                # address=address_by_route(),
                 # https://parsl.readthedocs.io/en/stable/stubs/parsl.providers.KubernetesProvider.html#parsl.providers.KubernetesProvider
                 provider=KubernetesProvider(
                     namespace=k8s_namespace,
                     # Docker image url to use for pods
-                    image="python",
+                    image="ghcr.io/mbjones/k8sparsl:0.3",
                     # Command to be run upon pod start, such as:
                     # "module load Anaconda; source activate parsl_env".
                     # or "pip install parsl"