Skip to content

Commit

Permalink
feat(controller): support new job/dataset/datastore/runtime (star-wha…
Browse files Browse the repository at this point in the history
…le#969)

* tune job and task impl

* feat(controller): add job token & storage env for job  (star-whale#940)

job token env; s3 env

* add project info for task

* add env for k8s core container

* test(e2e): modify e2e deployment from docker compose to minikube (star-whale#948)

e2e test

* feat(controller): storage prefix env (star-whale#941)

swds path prefix injection

Co-authored-by: star <15031259256@163.com>

* fix(controller): fix dataset upload error (star-whale#958)

* error fix

* fix protocol mismatch error

Co-authored-by: star <15031259256@163.com>

* change task container entrypoint

* fix(controller): fix sql error (star-whale#977)

fix some error

* add update last step ID mapper function

* bug fix

* add new console ui resource

* tune sql script version

* fix select task sql bug

* remove job list condition at debugging

* fix unit test error

Co-authored-by: gaoxinxing <15931259256@163.com>
Co-authored-by: renyanda <781905270@qq.com>
  • Loading branch information
3 people authored and dreamlandliu committed Aug 25, 2022
1 parent 3995e0d commit 27844d6
Show file tree
Hide file tree
Showing 60 changed files with 640 additions and 460 deletions.
6 changes: 1 addition & 5 deletions .github/workflows/e2e-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,13 @@ on:
env:
PYPI_RELEASE_VERSION: 100.0.0
RELEASE_VERSION: 0.0.0-dev
NEXUS_HOSTNAME: host.nexus
NEXUS_IMAGE: sonatype/nexus3:3.40.1
NEXUS_USER_NAME: admin
NEXUS_USER_PWD: admin123
PORT_NEXUS: 8081
PORT_CONTROLLER: 8082
PORT_NEXUS_DOCKER: 9001
IP_DOCKER_COMPOSE_BRIDGE: 172.18.0.1
SW_IMAGE_REPO: host.nexus:9001
PORT_NEXUS_DOCKER: 8083
IP_DOCKER_BRIDGE: 172.17.0.1
IP_DOCKER_COMPOSE_BRIDGE_RANGE: 172.0.0.0/8
REPO_NAME_DOCKER: docker-hosted
REPO_NAME_PYPI: pypi-hosted

Expand Down
3 changes: 1 addition & 2 deletions docker/Dockerfile.base_server
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@ ENV LANG=en_US.UTF-8
ENV LC_ALL=en_US.UTF-8
ENV TZ=Etc/UTC

# Install os dependence\OpenJDK-11\nvidia-smi
# Install OS dependencies and OpenJDK-11
RUN apt-get update \
&& apt-get install -y openjdk-11-jdk \
&& apt-get install -y --no-install-recommends nvidia-driver-${NV_DRIVER_VERSION} \
&& apt-get install -y --no-install-recommends \
wget locales make cmake build-essential software-properties-common curl sudo ca-certificates apt-transport-https iputils-ping net-tools openssh-server net-tools gcc-aarch64-linux-gnu \
zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev shellcheck git apt-utils gpg-agent \
Expand Down
2 changes: 1 addition & 1 deletion docker/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ release-nodejs:
$(call push-image,nodejs,${FIXED_VERSION_NODEJS_IMAGE})

build-console:
docker run --rm -it -v $(ROOT_DIR)console:/app -w /app ${DH_NODEJS_IMAGE} bash -c "yarn install && yarn build"
docker run --rm -v $(ROOT_DIR)console:/app -w /app ${DH_NODEJS_IMAGE} bash -c "yarn install && yarn build"

build-jar:
docker volume create --name maven-repo; docker run --rm -it -v maven-repo:/root/.m2 -v $(ROOT_DIR):/app -w /app maven:3.8.5-openjdk-11 mvn clean package -f server/pom.xml -DskipTests;
4 changes: 3 additions & 1 deletion docker/charts/templates/NOTES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ Chart Name: {{ .Chart.Name }}
Chart Version: {{ .Chart.Version }}
App Version: {{ .Chart.AppVersion }}
Starwhale Image:
- server: {{ .Values.image.registry }}/{{ .Values.image.org }}/{{ .Values.image.server.repo }}:{{ .Chart.AppVersion }}
- server: {{ .Values.image.registry }}/{{ .Values.image.org }}/{{ .Values.image.server.repo }}:{{ .Values.image.tag | default .Chart.AppVersion }}
Runtime default Image:
- runtime image: {{ .Values.image.registry }}/{{ .Values.image.org }}/{{ .Values.image.base.repo }}:{{ .Values.image.tag | default .Chart.AppVersion }}

******************************************
{{- if .Values.ingress.enabled }}
Expand Down
4 changes: 3 additions & 1 deletion docker/charts/templates/controller-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ spec:
serviceAccountName: {{ include "common.names.fullname" . }}
containers:
- name: controller
image: "{{ .Values.image.registry }}/{{ .Values.image.org }}/{{ .Values.image.server.repo }}:{{ .Chart.AppVersion }}"
image: "{{ .Values.image.registry }}/{{ .Values.image.org }}/{{ .Values.image.server.repo }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: IfNotPresent
ports:
- containerPort: {{ .Values.controller.containerPort }}
Expand Down Expand Up @@ -80,6 +80,8 @@ spec:
value: "{{ .Values.controller.upload.maxFileSize }}"
- name: SW_STORAGE_PREFIX
value: "{{ .Release.Name }}"
- name: SW_RUNTIME_IMAGE_DEFAULT
value: "{{ .Values.image.registry }}/{{ .Values.image.org }}/{{ .Values.image.base.repo }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
- name: SW_TASK_SPLIT_SIZE
value: "{{ .Values.controller.taskSplitSize }}"
- name: SW_HOST_IP
Expand Down
6 changes: 6 additions & 0 deletions example/mnist/requirements-sw-lock.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
numpy==1.21.0
torch
torchvision
Pillow
scikit-learn
starwhale
2 changes: 1 addition & 1 deletion scripts/e2e_test/copy_artifacts_to_server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ if [[ ! -z ${DEBUG} ]]; then
set -x
fi

source $WORK_DIR/venv/bin/activate
source $WORK_DIR/.venv/bin/activate
if [ -z "$GITHUB_ACTION" ]; then
export SW_CLI_CONFIG="$LOCAL_DATA_DIR/config.yaml"
export SW_LOCAL_STORAGE=$LOCAL_DATA_DIR/data
Expand Down
154 changes: 75 additions & 79 deletions scripts/e2e_test/start_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,28 +18,35 @@ if in_github_action; then
export SW_PYPI_EXTRA_INDEX_URL='https://pypi.org/simple'
else
SW_PYPI_EXTRA_INDEX_URL='https://pypi.doubanio.com/simple/'
export PARENT_CLEAN=true
export PARENT_CLEAN="${PARENT_CLEAN:=true}"
fi

# Export the settings used by the e2e run.
# Every line uses "${VAR:=default}": a value already present in the
# environment is kept, and the default is assigned only when VAR is unset or
# empty. The (possibly defaulted) value is then exported.
# NOTE(review): NEXUS_HOSTNAME, PORT_NEXUS_DOCKER and SW_IMAGE_REPO each appear
# twice below with different defaults. With ":=" the first occurrence already
# assigns the variable, so the second occurrence cannot change it.
declare_env() {
# Versions used for the locally built pypi package and images.
export PYPI_RELEASE_VERSION="${PYPI_RELEASE_VERSION:=100.0.0}"
export RELEASE_VERSION="${RELEASE_VERSION:=0.0.0-dev}"
# Nexus container: hostname, image and credentials.
export NEXUS_HOSTNAME="${NEXUS_HOSTNAME:=host.nexus}"
export NEXUS_HOSTNAME="${NEXUS_HOSTNAME:=host.minikube.internal}"
export NEXUS_IMAGE="${NEXUS_IMAGE:=sonatype/nexus3:3.40.1}"
export NEXUS_USER_NAME="${NEXUS_USER_NAME:=admin}"
export NEXUS_USER_PWD="${NEXUS_USER_PWD:=admin123}"
# Ports for Nexus, the controller and the Nexus docker registry.
export PORT_NEXUS="${PORT_NEXUS:=8081}"
export PORT_CONTROLLER="${PORT_CONTROLLER:=8082}"
export PORT_NEXUS_DOCKER="${PORT_NEXUS_DOCKER:=9001}"
export IP_DOCKER_COMPOSE_BRIDGE="${IP_DOCKER_COMPOSE_BRIDGE:=172.18.0.1}"
export SW_IMAGE_REPO="${SW_IMAGE_REPO:=host.nexus:9001}"
export PORT_NEXUS_DOCKER="${PORT_NEXUS_DOCKER:=8083}"
export IP_MINIKUBE_BRIDGE="${IP_MINIKUBE_BRIDGE:=192.168.49.1}"
export SW_IMAGE_REPO="${SW_IMAGE_REPO:=host.minikube.internal:8083}"
export IP_DOCKER_BRIDGE="${IP_DOCKER_BRIDGE:=172.17.0.1}"
export IP_DOCKER_COMPOSE_BRIDGE_RANGE="${IP_DOCKER_COMPOSE_BRIDGE_RANGE:=172.0.0.0/8}"
# Passed to `minikube start --insecure-registry` by start_minikube.
export IP_MINIKUBE_BRIDGE_RANGE="${IP_MINIKUBE_BRIDGE_RANGE:=192.0.0.0/8}"
# Names of the hosted repositories in Nexus.
export REPO_NAME_DOCKER="${REPO_NAME_DOCKER:=docker-hosted}"
export REPO_NAME_PYPI="${REPO_NAME_PYPI:=pypi-hosted}"
export PYTHON_VERSION="${PYTHON_VERSION:=3.9}"
}

# Start the minikube profile used by the e2e run, turn on the ingress and
# ingress-dns addons for it, then print the node description.
# Globals: IP_MINIKUBE_BRIDGE_RANGE (read) - passed to --insecure-registry.
# Returns: the exit status of the final `kubectl describe node`.
start_minikube() {
  local profile="sw-e2e-test"
  local addon

  minikube start -p "$profile" --insecure-registry "$IP_MINIKUBE_BRIDGE_RANGE"
  for addon in ingress ingress-dns; do
    minikube addons enable "$addon" -p "$profile"
  done
  kubectl describe node
}

start_nexus() {
docker run -d --publish=$PORT_NEXUS:$PORT_NEXUS --publish=$PORT_NEXUS_DOCKER:$PORT_NEXUS_DOCKER --name nexus -e NEXUS_SECURITY_RANDOMPASSWORD=false $NEXUS_IMAGE
sudo cp /etc/hosts /etc/hosts.bak_e2e
Expand All @@ -59,45 +66,22 @@ build_swcli() {
popd
}

# Build the console front-end by running `make build-console` in ../../docker
# (relative to the current directory), then return to the starting directory.
# Returns: non-zero without running make when ../../docker cannot be entered;
#          otherwise the exit status of the final popd.
build_console() {
  # Stop here if the directory is missing: otherwise make would run in the
  # wrong place and popd would pop an unrelated directory-stack entry.
  pushd ../../docker || return 1
  make build-console
  popd
}

build_server_image() {
pushd ../../server
make build-package
pushd ../docker
docker build -t server -f Dockerfile.server .
docker tag server $NEXUS_HOSTNAME:$PORT_NEXUS_DOCKER/server
override_docker_compose
popd
pushd ../../docker
docker build -t server -f Dockerfile.server .
docker tag server $NEXUS_HOSTNAME:$PORT_NEXUS_DOCKER/star-whale/server:$PYPI_RELEASE_VERSION
popd
}

# Replace compose/compose.override.yaml (relative to the current directory)
# with an override generated from the e2e environment variables.
# The previous file is first copied to compose.override.yaml.bak_e2e.
override_docker_compose() {
cp compose/compose.override.yaml compose/compose.override.yaml.bak_e2e
# The heredoc delimiter is unquoted, so every $VAR below is expanded at the
# moment the file is written.
cat > compose/compose.override.yaml << EOF
services:
controller:
image: $NEXUS_HOSTNAME:$PORT_NEXUS_DOCKER/server
agent:
image: $NEXUS_HOSTNAME:$PORT_NEXUS_DOCKER/server
environment:
- SW_PYPI_INDEX_URL=http://$IP_DOCKER_COMPOSE_BRIDGE:$PORT_NEXUS/$REPO_NAME_PYPI/simple
- SW_PYPI_EXTRA_INDEX_URL=$SW_PYPI_EXTRA_INDEX_URL
- SW_PYPI_TRUSTED_HOST=$IP_DOCKER_COMPOSE_BRIDGE
- SW_TASK_USE_HOST_NETWORK=1
extra_hosts:
- $NEXUS_HOSTNAME:$IP_DOCKER_COMPOSE_BRIDGE
taskset:
volumes:
- agent_data:/opt/starwhale
- taskset_dind_data:/var/lib/docker
- /tmp/docker-daemon.json:/etc/docker/daemon.json
extra_hosts:
- $NEXUS_HOSTNAME:$IP_DOCKER_COMPOSE_BRIDGE
EOF

}

overwrite_pypirc() {
if file_exists "$HOME/.pypirc" ; then
cp $HOME/.pypirc $HOME/.pypirc.bak_e2e
Expand Down Expand Up @@ -138,11 +122,6 @@ EOF
cat $HOME/.pip/pip.conf
}

# Write a docker daemon configuration to /tmp/docker-daemon.json.
# The JSON is a single escaped string; $IP_DOCKER_COMPOSE_BRIDGE_RANGE is
# inserted into "insecure-registries" and
# http://$NEXUS_HOSTNAME:$PORT_NEXUS_DOCKER into "registry-mirrors".
# The file path matches the one mounted as /etc/docker/daemon.json by
# override_docker_compose.
create_daemon_json_for_taskset() {
echo "{\"hosts\":[\"tcp://0.0.0.0:2376\",\"unix:///var/run/docker.sock\"],\"insecure-registries\":[\"10.0.0.0/8\",\"127.0.0.0/8\",\"$IP_DOCKER_COMPOSE_BRIDGE_RANGE\",\"192.0.0.0/8\"],\"live-restore\":true,\"max-concurrent-downloads\":20,\"max-concurrent-uploads\":20,\"registry-mirrors\":[\"http://$NEXUS_HOSTNAME:$PORT_NEXUS_DOCKER\"],\"mtu\":1450,\"runtimes\":{\"nvidia\":{\"path\":\"nvidia-container-runtime\",\"runtimeArgs\":[]}},\"storage-driver\":\"overlay2\"}" > /tmp/docker-daemon.json
}


# Copy the service_wait.sh helper from the current directory into /tmp.
# Returns: the exit status of cp.
create_service_check_file() {
  local helper="service_wait.sh"
  local target="/tmp/${helper}"

  cp "$helper" "$target"
}
Expand All @@ -167,28 +146,42 @@ upload_pypi_to_nexus() {
# Build the `starwhale` runtime image from ../../docker/Dockerfile.starwhale
# and tag it for the registry at $NEXUS_HOSTNAME:$PORT_NEXUS_DOCKER.
# NOTE: the function name is spelled "buid" (sic). main calls it by this exact
# name, so it must not be renamed here alone.
buid_runtime_image() {
pushd ../../docker
# Build args: ENABLE_E2E_TEST_PYPI_REPO, PORT_NEXUS, LOCAL_PYPI_HOSTNAME
# (set to $IP_DOCKER_BRIDGE) and SW_VERSION are handed to the Dockerfile.
docker build -t starwhale -f Dockerfile.starwhale --build-arg ENABLE_E2E_TEST_PYPI_REPO=1 --build-arg PORT_NEXUS=$PORT_NEXUS --build-arg LOCAL_PYPI_HOSTNAME=$IP_DOCKER_BRIDGE --build-arg SW_VERSION=$PYPI_RELEASE_VERSION .
docker tag starwhale $NEXUS_HOSTNAME:$PORT_NEXUS_DOCKER/starwhale:$PYPI_RELEASE_VERSION
docker tag starwhale $NEXUS_HOSTNAME:$PORT_NEXUS_DOCKER/star-whale/starwhale:$PYPI_RELEASE_VERSION
popd
}

# Log in to the docker registry at $NEXUS_HOSTNAME:$PORT_NEXUS_DOCKER and push
# the server and starwhale images tagged for it.
# NOTE(review): the password is passed on the command line via -p, which shows
# up in process listings and `set -x` traces; consider --password-stdin.
push_images_to_nexus() {
docker login http://$NEXUS_HOSTNAME:$PORT_NEXUS_DOCKER -u $NEXUS_USER_NAME -p $NEXUS_USER_PWD
docker push $NEXUS_HOSTNAME:$PORT_NEXUS_DOCKER/server
docker push $NEXUS_HOSTNAME:$PORT_NEXUS_DOCKER/starwhale:$PYPI_RELEASE_VERSION
docker push $NEXUS_HOSTNAME:$PORT_NEXUS_DOCKER/star-whale/server:$PYPI_RELEASE_VERSION
docker push $NEXUS_HOSTNAME:$PORT_NEXUS_DOCKER/star-whale/starwhale:$PYPI_RELEASE_VERSION
}

start_docker_compose() {
pushd ../../docker/compose
if ! type "$docker-compose" > /dev/null; then
docker compose up -d
else
docker-compose up -d
fi
# Install (or upgrade) the `starwhale` Helm release from ../../docker/charts
# into the `starwhale` namespace, creating the namespace when needed.
# Globals (read): NEXUS_HOSTNAME, PORT_NEXUS_DOCKER, PORT_NEXUS,
#                 PYPI_RELEASE_VERSION, REPO_NAME_PYPI, SW_PYPI_EXTRA_INDEX_URL
# Returns: non-zero without calling helm when the chart directory cannot be
#          entered; otherwise the exit status of the final popd.
start_starwhale() {
  # One chart override per element; they are joined with commas into a single
  # --set argument. No whitespace may follow '=': inside the quoted --set
  # string it would become part of the value (e.g. a pypi index URL starting
  # with a space).
  local -a chart_values=(
    "resources.controller.requests.memory=4G"
    "resources.controller.requests.cpu=1000m"
    "resources.controller.limits.cpu=1000m"
    "minio.resources.requests.cpu=1000m"
    "minio.resources.limits.cpu=2000m"
    "controller.taskSplitSize=1"
    "minikube.enabled=true"
    "image.registry=${NEXUS_HOSTNAME}:${PORT_NEXUS_DOCKER}"
    "image.tag=${PYPI_RELEASE_VERSION}"
    "mirror.pypi.indexUrl=http://${NEXUS_HOSTNAME}:${PORT_NEXUS}/repository/${REPO_NAME_PYPI}/simple"
    "mirror.pypi.extraIndexUrl=${SW_PYPI_EXTRA_INDEX_URL}"
    "mirror.pypi.trustedHost=${NEXUS_HOSTNAME}"
  )
  local set_arg
  # IFS is changed only inside the command-substitution subshell.
  set_arg=$(IFS=,; printf '%s' "${chart_values[*]}")

  # Without the chart directory helm would run against the wrong path.
  pushd ../../docker/charts || return 1
  helm upgrade --install starwhale ./ --namespace starwhale --create-namespace --set "$set_arg"
  popd
}

# Wait for the controller to come up, then forward its service port to the
# local machine in the background.
check_controller_service() {
# Run the helper copied to /tmp by create_service_check_file against the
# controller URL.
chmod u+x /tmp/service_wait.sh && /tmp/service_wait.sh http://$NEXUS_HOSTNAME:$PORT_CONTROLLER
# Poll every 15 seconds until the first container of the first pod labelled
# starwhale.ai/role=controller reports started=true.
# NOTE(review): the loop has no timeout or retry limit.
while true
do
started=`kubectl get pod -l starwhale.ai/role=controller -n starwhale -o json| jq -r '.items[0].status.containerStatuses[0].started'`
if [[ "$started" == "true" ]]; then
echo "controller started"
break
else
echo "controller is starting"
kubectl get pods --namespace starwhale
# kubectl get pod -l starwhale.ai/role=controller -n starwhale -o json| jq -r '.items[0].status'
# ready=`kubectl get pod -l starwhale.ai/role=controller -n starwhale -o json| jq -r '.items[0].status.phase'`
# if [[ "$ready" == "Running" ]]; then
# name=`kubectl get pod -l starwhale.ai/role=controller -n starwhale -o json| jq -r '.items[0].metadata.name'`
# kubectl describe pod $name --namespace starwhale
# fi
fi
sleep 15
done
# NOTE(review): the forwarded port is hard-coded to 8082 rather than taken from
# $PORT_CONTROLLER, which the tests use to reach the controller.
nohup kubectl port-forward --namespace starwhale svc/starwhale-controller 8082:8082 &
}

standalone_test() {
Expand All @@ -199,16 +192,20 @@ standalone_test() {
rm -rf venv*
pushd ../
scripts/run_demo.sh
export WORK_DIR=`cat WORK_DIR`
export LOCAL_DATA_DIR=`cat LOCAL_DATA_DIR`
scripts/e2e_test/copy_artifacts_to_server.sh 127.0.0.1:$PORT_CONTROLLER
scripts/e2e_test/test_job_run.sh 127.0.0.1:$PORT_CONTROLLER
popd
popd

}

api_test() {
pushd ../../
export WORK_DIR=`cat WORK_DIR`
if ! in_github_action; then
export LOCAL_DATA_DIR=`cat LOCAL_DATA_DIR`
fi
scripts/e2e_test/copy_artifacts_to_server.sh 127.0.0.1:$PORT_CONTROLLER
scripts/e2e_test/test_job_run.sh 127.0.0.1:$PORT_CONTROLLER
popd
pushd ../apitest/pytest
python3 -m pip install -r requirements.txt
pytest --host 127.0.0.1 --port $PORT_CONTROLLER
Expand All @@ -217,25 +214,17 @@ api_test() {

restore_env() {
rm -rf venve2e
mv ~/.pypirc.bak_e2e ~/.pypirc
mv ~/.pip/pip.conf.bak_e2e ~/.pip/pip.conf
sudo mv /etc/hosts.bak_e2e /etc/hosts
rm /tmp/service_wait.sh
docker kill nexus
docker container rm nexus
docker image rm starwhale
docker image rm $NEXUS_HOSTNAME:$PORT_NEXUS_DOCKER/starwhale:$PYPI_RELEASE_VERSION
docker image rm $NEXUS_HOSTNAME:$PORT_NEXUS_DOCKER/server
docker image rm $NEXUS_HOSTNAME:$PORT_NEXUS_DOCKER/star-whale/starwhale:$PYPI_RELEASE_VERSION
docker image rm $NEXUS_HOSTNAME:$PORT_NEXUS_DOCKER/star-whale/server:$PYPI_RELEASE_VERSION
docker image rm server
mv ~/.pypirc.bak_e2e ~/.pypirc
mv ~/.pip/pip.conf.bak_e2e ~/.pip/pip.conf
rm /tmp/service_wait.sh
script_dir="$(dirname -- "$(readlink -f "${BASH_SOURCE[0]}")")"
cd $script_dir/../../docker/compose
mv compose.override.yaml.bak_e2e compose.override.yaml
dc=`which docker-compose`
if [ -z $dc ]; then
docker compose down
else
docker-compose down
fi
minikube delete -p sw-e2e-test
cd $script_dir/../../
WORK_DIR=`cat WORK_DIR`
if test -n $WORK_DIR ; then
Expand All @@ -245,28 +234,35 @@ restore_env() {
rm LOCAL_DATA_DIR
echo 'cleanup'
}
if ! in_github_action; then
trap restore_env EXIT
fi

# Run the whole e2e flow in order: set up the environment and services, build
# and publish the artifacts, start the server, then run the tests.
main() {
declare_env
# Outside GitHub Actions, restore the local machine when the script exits.
if ! in_github_action; then
trap restore_env EXIT
fi
# Services and local pip/pypi configuration.
start_nexus
start_minikube
overwrite_pip_config
overwrite_pypirc
# Build artifacts.
build_swcli
build_console
build_server_image
create_daemon_json_for_taskset
create_service_check_file
# Prepare Nexus and publish the packages and images to it.
check_nexus_service
create_repository_in_nexus
upload_pypi_to_nexus
buid_runtime_image
push_images_to_nexus
start_docker_compose
check_controller_service
# Deploy the server, then run the standalone and API tests.
start_starwhale
standalone_test
check_controller_service
api_test
}

main
# Entry point. With no argument the full e2e flow (main) runs; otherwise the
# first argument names the single step function to run, e.g. `start_minikube`.
declare_env
# Quoted test: an unquoted `test -z $1` is only true for an empty argument by
# accident and prints an error when the argument contains whitespace.
if [[ -z "${1:-}" ]]; then
  main
else
  # Left unquoted on purpose so the existing word-splitting behaviour of the
  # step argument is preserved.
  $1
fi

16 changes: 12 additions & 4 deletions scripts/e2e_test/test_job_run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ job_id=`curl -X 'POST' \
"datasetVersionUrls": "1",
"runtimeVersionUrl": "1",
"device": "1",
"deviceAmount": 1,
"deviceAmount": 1000,
"comment": "string"
}' | jq -r '.data'`

Expand All @@ -39,10 +39,10 @@ fi

while true
do
curl -X 'GET' \
if curl -X 'GET' \
"http://$1/api/v1/project/1/job/$job_id" \
-H 'accept: application/json' \
-H "$auth_header" | jq -r '.data.jobStatus' > jobStatus
-H "$auth_header" | jq -r '.data.jobStatus' > jobStatus ; then echo "8082 well"; else kubectl logs --tail=10 -l starwhale.ai/role=controller -n starwhale; continue; fi
job_status=`cat jobStatus`
if [ "$job_status" == "null" ] ; then
echo "Error! job_status id is null" 1>&2
Expand All @@ -56,7 +56,15 @@ do
break
else
echo "job status for " "$job_id" "is" "$job_status"
sleep 1
# kubectl logs --tail=10 -l job-name=1 -n starwhale -c data-provider
# kubectl logs --tail=10 -l job-name=1 -n starwhale -c untar
# kubectl logs --tail=10 -l job-name=1 -n starwhale -c worker
# kubectl logs --tail=10 -l job-name=1 -n starwhale -c result-uploader

# kubectl logs -f -l starwhale.ai/role=controller -n starwhale
# kubectl describe pod -l "job-name in (1,2,3,4,5,6,7,8,9,10)" -n starwhale
# kubectl describe node
sleep 10
fi
done

Expand Down
3 changes: 2 additions & 1 deletion scripts/run_demo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ fi
echo $WORK_DIR > WORK_DIR

finish() {
if ! in_github_action && test -z $PARENT_CLEAN ; then
if ! in_github_action && test -z "$PARENT_CLEAN" ; then
echo 'cleanup work dir '"$WORK_DIR"
rm -rf "$WORK_DIR"
fi
echo 'cleanup'
Expand Down
Loading

0 comments on commit 27844d6

Please sign in to comment.