Skip to content

Commit

Permalink
kfpint: switch to using minikube cluster (pytorch#696)
Browse files Browse the repository at this point in the history
  • Loading branch information
d4l3k authored Feb 26, 2023
1 parent 210b261 commit 8f50349
Show file tree
Hide file tree
Showing 11 changed files with 91 additions and 59 deletions.
48 changes: 9 additions & 39 deletions .github/workflows/kfp-integration-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,57 +8,27 @@ on:

jobs:
kfp-launch:
runs-on: ubuntu-20.04
permissions:
id-token: write
contents: read
runs-on: linux.20_04.4x
steps:
- name: Install kubectl
# More info: https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/
run: |
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
chmod +x kubectl
mkdir -p ~/.local/bin/kubectl
mv ./kubectl ~/.local/bin/kubectl
export PATH=$PATH:~/.local/bin/kubectl
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v1.6.1
with:
aws-region: us-west-2
role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
role-session-name: github-torchx
continue-on-error: true
- name: Configure Kube Config
env:
AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }}
run: |
set -eux
if [ -n "$AWS_ROLE_ARN" ]; then
aws eks update-kubeconfig --region=us-west-2 --name=${{ secrets.EKS_CLUSTER_NAME }}
fi
- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: 3.8
architecture: x64
- name: Checkout TorchX
uses: actions/checkout@v2
- name: Configure Docker
env:
AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }}
run: |
set -eux
if [ -n "$AWS_ROLE_ARN" ]; then
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 495572122715.dkr.ecr.us-west-2.amazonaws.com
fi
- name: Install dependencies
run: |
set -eux
pip install -r dev-requirements.txt
python setup.py install
- name: Start Kubernetes
run: |
scripts/setup_minikube.sh
scripts/setup_kfp.sh
- name: Run KFP Integration Tests
env:
KFP_NAMESPACE: ${{ secrets.KFP_NAMESPACE }}
INTEGRATION_TEST_STORAGE: ${{ secrets.INTEGRATION_TEST_STORAGE }}
CONTAINER_REPO: ${{ secrets.CONTAINER_REPO }}
run: scripts/kfpint.py
KFP_NAMESPACE: kubeflow
INTEGRATION_TEST_STORAGE: torchx_minio://torchx/tests
run: scripts/kfpint.py --container_repo localhost:5000/torchx
6 changes: 3 additions & 3 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
aiobotocore==2.1.0
aiobotocore==2.4.2
ax-platform[mysql]==0.2.3
black==22.3.0
boto3==1.20.24
boto3==1.24.59
captum>=0.4.0
flake8==3.9.0
fsspec[s3]==2022.1.0
fsspec[s3]==2023.1.0
google-api-core
google-cloud-batch>=0.5.0
google-cloud-logging>=3.0.0
Expand Down
4 changes: 2 additions & 2 deletions resources/kfp_volcano_role_binding.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: kfp-volcano
namespace: torchx-dev
namespace: kubeflow
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kfp-volcano
subjects:
- kind: ServiceAccount
name: pipeline-runner
namespace: torchx-dev
namespace: kubeflow
3 changes: 2 additions & 1 deletion scripts/kfpint.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,7 @@ async def exec_job() -> None:
help="if specified save the build to path and exit",
action="store_true",
)
parser.add_argument("--container_repo", type=str)
args = parser.parse_args()

with path_or_tmp(args.path) as path:
Expand All @@ -245,7 +246,7 @@ async def exec_job() -> None:
dist_pipeline_file = os.path.join(path, "dist_pipeline.yaml")
build = build_images()
try:
push_images(build)
push_images(build, container_repo=args.container_repo)
except MissingEnvError as e:
print(f"Missing environments, only building: {e}")
return
Expand Down
9 changes: 9 additions & 0 deletions scripts/setup_kfp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# Installs Kubeflow Pipelines (KFP) into the cluster that the current kubectl
# context points at, then applies the TorchX volcano role binding.
# NOTE(review): the last step uses a relative path, so this presumably has to
# be run from the repository root -- confirm against callers.

# -e: abort on the first failing command, -u: treat unset variables as errors,
# -x: echo each command before running it.
set -eux

# KFP release to install; used as the git ref for both kustomize targets below.
export PIPELINE_VERSION=1.8.5
# Apply the cluster-scoped resources first.
kubectl apply -k "github.com/kubeflow/pipelines/manifests/kustomize/cluster-scoped-resources?ref=$PIPELINE_VERSION"
# Block (up to 60s) until the Application CRD is established before applying
# the environment manifests.
kubectl wait --for condition=established --timeout=60s crd/applications.app.k8s.io
# Apply the "dev" environment manifests from the same KFP release.
kubectl apply -k "github.com/kubeflow/pipelines/manifests/kustomize/env/dev?ref=$PIPELINE_VERSION"
# Apply the kfp-volcano RoleBinding manifest from this repository.
kubectl apply -f resources/kfp_volcano_role_binding.yaml
15 changes: 15 additions & 0 deletions scripts/setup_minikube.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,21 @@ set -eux
minikube delete
minikube start --driver=docker --cpus=max --memory=max --nodes=2
minikube addons enable registry

# setup multi node volumes
# https://github.com/kubernetes/minikube/issues/12360#issuecomment-1430243861
minikube addons disable storage-provisioner
minikube addons disable default-storageclass
minikube addons enable volumesnapshots
minikube addons enable csi-hostpath-driver
kubectl patch storageclass csi-hostpath-sc -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'

# install volcano
kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.7.0/installer/volcano-development.yaml

# create namespace
kubectl create namespace torchx-dev

# portforwarding
kubectl port-forward --namespace kube-system service/registry 5000:80 &

3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ def get_nightly_version():
"torchx.tracker": [
"fsspec=torchx.tracker.backend.fsspec:create",
],
"fsspec.specs": [
"torchx_minio=torchx.test.minio.MinioFS",
],
},
extras_require={
"gcp_batch": [
Expand Down
27 changes: 17 additions & 10 deletions torchx/examples/pipelines/kfp/advanced_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,12 @@

processed_data_path: str = os.path.join(args.output_path, "processed")
datapreproc_app: specs.AppDef = utils_python(
*("--output_path", processed_data_path, "--input_path", data_path),
"--output_path",
processed_data_path,
"--input_path",
data_path,
"--limit",
"100",
image=args.image,
m="torchx.examples.apps.datapreproc.datapreproc",
cpu=1,
Expand Down Expand Up @@ -253,15 +258,17 @@ def pipeline() -> None:
trainer.container.set_tty()
trainer.after(datapreproc)

serve = container_from_app(serve_app)
serve.container.set_tty()
serve.after(trainer)

# Serve and interpret only require the trained model so we can run them
# in parallel to each other.
interpret = container_from_app(interpret_app)
interpret.container.set_tty()
interpret.after(trainer)
if False:
serve = container_from_app(serve_app)
serve.container.set_tty()
serve.after(trainer)

if False:
# Serve and interpret only require the trained model so we can run them
# in parallel to each other.
interpret = container_from_app(interpret_app)
interpret.container.set_tty()
interpret.after(trainer)


kfp.compiler.Compiler().compile(
Expand Down
2 changes: 1 addition & 1 deletion torchx/examples/pipelines/kfp/dist_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def pipeline() -> None:
# To convert the TorchX AppDef into a KFP container we use
# the resource_from_app adapter. This generates a KFP Kubernetes
# resource operator definition from the TorchX app def and instantiates it.
echo_container: kfp.dsl.BaseOp = resource_from_app(echo_app, queue="test")
echo_container: kfp.dsl.BaseOp = resource_from_app(echo_app, queue="default")


# %%
Expand Down
6 changes: 3 additions & 3 deletions torchx/runtime/container/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@ FROM pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime
WORKDIR /app

# upgrade pip to 22.x+ which has a faster dependency resolver
RUN pip install --upgrade pip
RUN pip install --upgrade pip wheel --no-cache-dir

# copy requirements early so we don't have to redownload dependencies on code
# changes
COPY dev-requirements.txt /app
RUN pip install -r dev-requirements.txt
RUN pip install -r dev-requirements.txt --no-cache-dir

COPY . /app

RUN python setup.py install
RUN pip install . --no-cache-dir
27 changes: 27 additions & 0 deletions torchx/test/minio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import s3fs


class MinioFS(s3fs.S3FileSystem):
    """
    A test FS that uses a MinIO filesystem on top of s3fs for TorchX integration
    tests in minikube.

    Unless the caller says otherwise, the filesystem authenticates with the
    key ``minio`` / secret ``minio123`` and talks to
    ``http://minio-service:9000``.
    NOTE(review): these presumably match the MinIO instance deployed by the
    KFP manifests used in the integration tests -- confirm.

    Explicitly passed ``key``, ``secret`` or ``client_kwargs`` take precedence
    over these defaults instead of raising ``TypeError`` for a duplicated
    keyword argument; ``client_kwargs`` is merged key by key, so the default
    ``endpoint_url`` is kept unless it is overridden.
    """

    protocol = ["torchx_minio", "s3", "s3a"]

    def __init__(self, *args: object, **kwargs: object) -> None:
        # Fill in credentials only when the caller did not provide them, so
        # passing key=/secret= no longer collides with hard-coded keywords.
        kwargs.setdefault("key", "minio")
        kwargs.setdefault("secret", "minio123")

        # Start from the default endpoint and layer any caller-supplied client
        # options on top of it.
        client_kwargs = {"endpoint_url": "http://minio-service:9000"}
        client_kwargs.update(kwargs.pop("client_kwargs", None) or {})

        super().__init__(*args, client_kwargs=client_kwargs, **kwargs)

0 comments on commit 8f50349

Please sign in to comment.