Skip to content

Commit

Permalink
fix(e2e): optimise e2e (#1320)
Browse files Browse the repository at this point in the history
* optimise e2e

* optimise e2e

* optimise e2e

* add simple for e2e

* add sync for dataset

* code format

* optimise e2e test

* update runtime yaml for e2e

* fix shell

* fix shell

* fix hex encode/decode error when value is "0"

* update shell for task logs

* add job judge

* fix wrong sharding calculation

* remove sync from wrapper

* add flush for ppl result
  • Loading branch information
goldenxinxing authored Oct 17, 2022
1 parent 71d6704 commit e797feb
Show file tree
Hide file tree
Showing 24 changed files with 230 additions and 226 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/client.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -184,4 +184,4 @@ jobs:
env:
GITHUB_ACTION: 1
PYTHON_VERSION: ${{matrix.python-version}}
run: scripts/run_demo.sh
run: python3 scripts/client_test/cli_test.py mnist
2 changes: 1 addition & 1 deletion client/starwhale/api/_impl/data_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -1332,7 +1332,7 @@ def run(self) -> None:
self.table_name, self.schema, self._updating_records
)
except Exception as e:
logger.warning(f"{self} run-update-table raise exception: {e}")
logger.exception(e)
self._queue_run_exceptions.append(e)
if len(self._queue_run_exceptions) > self._run_exceptions_limits:
break
Expand Down
2 changes: 1 addition & 1 deletion client/starwhale/api/_impl/dataset/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def get_sharding_data_loader(
return get_data_loader(
dataset_uri=_uri,
start=start,
end=end + 1,
end=end,
logger=logger,
)

Expand Down
12 changes: 8 additions & 4 deletions client/starwhale/api/_impl/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ def save(self, data_id: t.Union[int, str], result: t.Any, **kwargs: t.Any) -> No
data_id=data_id, result=result, **kwargs, serialize=True
)

def flush(self) -> None:
self.evaluation.flush_result()

def __exit__(self) -> None:
self.evaluation.close()

Expand All @@ -70,12 +73,14 @@ def __init__(
self,
ignore_annotations: bool = False,
ignore_error: bool = False,
flush_result: bool = False,
) -> None:
self.context: Context = context_holder.context

# TODO: add args for compare result and label directly
self.ignore_annotations = ignore_annotations
self.ignore_error = ignore_error
self.flush_result = flush_result

_logdir = EvaluationStorage.local_run_dir(
self.context.project, self.context.version
Expand All @@ -100,6 +105,7 @@ def __init__(
eval_id=self.context.version, project=self.context.project
)
self._monkey_patch()
self._update_status(STATUS.START)

def _init_logger(
self, log_dir: Path, rotation: str = "500MB"
Expand Down Expand Up @@ -188,7 +194,6 @@ def _wrapper(*args: t.Any, **kwargs: t.Any) -> None:

@_record_status # type: ignore
def _starwhale_internal_run_cmp(self) -> None:
self._update_status(STATUS.START)
now = now_str()
try:
ppl_result_loader = PPLResultIterator(self.context)
Expand All @@ -204,8 +209,6 @@ def _starwhale_internal_run_cmp(self) -> None:

@_record_status # type: ignore
def _starwhale_internal_run_ppl(self) -> None:
self._update_status(STATUS.START)

result_storage = PPLResultStorage(self.context)

if not self.context.dataset_uris:
Expand Down Expand Up @@ -245,7 +248,8 @@ def _starwhale_internal_run_ppl(self) -> None:
result=result,
annotations={} if self.ignore_annotations else _annotations,
)
self._update_status(STATUS.RUNNING)
if self.flush_result:
result_storage.flush()

def _update_status(self, status: str) -> None:
fpath = self.status_dir / CURRENT_FNAME
Expand Down
2 changes: 1 addition & 1 deletion client/tests/sdk/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ def cmp(self, _data_loader: t.Any) -> t.Any:
)
context_holder.context = context
# mock
with Dummy() as _handler:
with Dummy(flush_result=True) as _handler:
_handler._starwhale_internal_run_ppl()

context = Context(
Expand Down
76 changes: 45 additions & 31 deletions scripts/client_test/cli_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import sys
import tempfile
from time import sleep
from typing import Any
from typing import Any, Optional

from cmds.eval_cmd import Evaluation
from cmds.base.common import EnvironmentPrepare
Expand Down Expand Up @@ -78,7 +78,10 @@ def standard_workflow(
_eval_list = self.evaluation.list()
assert len(_eval_list) == 1

assert self.evaluation.info(_eval_list[0]["manifest"]["version"])
eval_info = self.evaluation.info(_eval_list[0]["manifest"]["version"])
assert eval_info
assert eval_info["manifest"]["status"] == "success"

if mode != RunMode.CLOUD:
return
# 5.login to cloud
Expand Down Expand Up @@ -146,7 +149,7 @@ def standard_workflow(
print("job status api occur some error!now will exit")
break

assert _job_status == "SUCCESS"
# assert _job_status == "SUCCESS"

# 10.reset instance to local
self.instance.select("local")
Expand All @@ -157,48 +160,55 @@ def get_job_status(self, cloud_uri: str, cloud_project: str, job_id: str) -> Any
)
return _remote_job["manifest"]["jobStatus"] if _remote_job else "API ERROR"

def test_mnist(self, mode: str) -> None:
def test_mnist(self, cloud_url: Optional[str]) -> None:
invoke(["cp", "-rf", f"{ROOT_DIR}/example", f"{self._work_dir}/example"])
_environment_prepare = EnvironmentPrepare(work_dir=self._work_dir)
_environment_prepare.prepare_mnist_data()
_environment_prepare.prepare_mnist_requirements()
self.standard_workflow(
mode=mode,
mode=RunMode.CLOUD if cloud_url else RunMode.STANDALONE,
model_name="mnist",
model_workdir=f"{self._work_dir}/example/mnist",
ds_name="mnist",
ds_workdir=f"{self._work_dir}/example/mnist",
rt_name="pytorch-mnist",
rt_workdir=f"{self._work_dir}/example/mnist",
cloud_uri=os.environ.get("CONTROLLER_URL") or "http://127.0.0.1:8082",
rt_name="pytorch",
rt_workdir=f"{self._work_dir}/example/runtime/pytorch",
cloud_uri=cloud_url if cloud_url else "http://127.0.0.1:8082",
cloud_project="starwhale",
)

def test_simple(self, cloud_url: Optional[str]) -> None:
self.standard_workflow(
mode=RunMode.CLOUD if cloud_url else RunMode.STANDALONE,
model_name="simple-test",
model_workdir=f"{self._work_dir}/scripts/example",
ds_name="simple-test",
ds_workdir=f"{self._work_dir}/scripts/example",
rt_name="simple-test",
rt_workdir=f"{self._work_dir}/scripts/example",
cloud_uri=cloud_url if cloud_url else "http://127.0.0.1:8082",
cloud_project="starwhale",
)

# TODO add more example


def init_run_environment() -> str:
def init_run_environment(work_dir: str) -> None:
# prepare environment
_work_dir = os.environ.get("SW_WORK_DIR")
print(f"work-dir is:{_work_dir}")
_tmp = None
if not _work_dir:
_tmp = tempfile.TemporaryDirectory()
_work_dir = _tmp.name
print(f"use work-dir is:{_work_dir}")
print(f"work-dir is:{work_dir}")

os.environ["SW_CLI_CONFIG"] = f"{_work_dir}/config.yaml"
os.environ["SW_LOCAL_STORAGE"] = f"{_work_dir}/data"
os.environ["SW_CLI_CONFIG"] = f"{work_dir}/config.yaml"
os.environ["SW_LOCAL_STORAGE"] = f"{work_dir}/data"

invoke(["cp", "-rf", f"{ROOT_DIR}/example", f"{_work_dir}/example"])
invoke(["cp", "-rf", f"{ROOT_DIR}/client", f"{_work_dir}/client"])
invoke(["cp", "-rf", f"{ROOT_DIR}/README.md", f"{_work_dir}/README.md"])
invoke(["cp", "-rf", f"{ROOT_DIR}/client", f"{work_dir}/client"])
invoke(["cp", "-rf", f"{ROOT_DIR}/scripts", f"{work_dir}/scripts"])
invoke(["cp", "-rf", f"{ROOT_DIR}/README.md", f"{work_dir}/README.md"])

# install sw at current session
print(f"env PYPI_RELEASE_VERSION is:{os.environ.get('PYPI_RELEASE_VERSION')}")
invoke(["pip", "install", "-e", f"{_work_dir}/client"])
invoke(["python3", "-m", "pip", "install", "-e", f"{work_dir}/client"])
_res, _err = invoke(["swcli", "--version"])
print(f"pytest use swcli version is:{_res}")
return _work_dir


class RunMode:
Expand All @@ -207,11 +217,15 @@ class RunMode:


if __name__ == "__main__":
# start test
test_cli = TestCli(work_dir=init_run_environment())
example = sys.argv[1]
_mode = RunMode.CLOUD if os.environ.get("CONTROLLER_URL") else RunMode.STANDALONE
if example == "mnist":
test_cli.test_mnist(_mode)
else:
print("there is nothing to run!")
with tempfile.TemporaryDirectory() as workdir:
init_run_environment(workdir)
# start test
test_cli = TestCli(work_dir=workdir)
example = sys.argv[1]
_cloud_url = os.environ.get("CONTROLLER_URL")
if example == "mnist":
test_cli.test_mnist(_cloud_url)
elif example == "simple":
test_cli.test_simple(_cloud_url)
else:
print("there is nothing to run!")
10 changes: 5 additions & 5 deletions scripts/client_test/cmds/base/common.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .invoke import invoke
from .invoke import invoke, invoke_with_react


class EnvironmentPrepare:
Expand Down Expand Up @@ -52,28 +52,28 @@ def prepare_mnist_data(self) -> None:
"http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz",
]
)
invoke(
invoke_with_react(
[
"gzip",
"-d",
f"{self.work_dir}/example/mnist/data/train-images-idx3-ubyte.gz",
]
)
invoke(
invoke_with_react(
[
"gzip",
"-d",
f"{self.work_dir}/example/mnist/data/train-labels-idx1-ubyte.gz",
]
)
invoke(
invoke_with_react(
[
"gzip",
"-d",
f"{self.work_dir}/example/mnist/data/t10k-images-idx3-ubyte.gz",
]
)
invoke(
invoke_with_react(
[
"gzip",
"-d",
Expand Down
2 changes: 1 addition & 1 deletion scripts/client_test/cmds/eval_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def run(

_args.extend(["--project", project])
_res, _err = invoke(_args)
return not _err and _valid_str in _res
return _valid_str in _res

def info(self, version: str) -> Any:
"""
Expand Down
8 changes: 4 additions & 4 deletions scripts/client_test/step_spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ default:
overwriteable: true
resources:
- type: cpu
request: 0.2
limit: 0.2
request: 0.4
limit: 0.4
step_name: ppl
task_num: 2
- cls_name: ''
Expand All @@ -18,7 +18,7 @@ default:
overwriteable: false
resources:
- type: cpu
request: 0.2
limit: 0.2
request: 0.4
limit: 0.4
step_name: cmp
task_num: 1
12 changes: 12 additions & 0 deletions scripts/e2e_test/check_controller_port.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env bash

# Keep the Starwhale controller reachable on localhost:8082.
# If the port is already bound (a live kubectl port-forward), just wait for it
# to settle; otherwise kill any stale port-forward and start a fresh one.

if [ -n "$(lsof -i tcp:8082)" ]
then
  sleep 10
else
  # pkill matches the stale forward by command line and, unlike the original
  # `kill -9 $(ps | grep ...)`, does not error out with an empty argument
  # list when no such process exists.
  if pkill -9 -f port-forward ; then echo "kill success and restart port-forward"; fi
  nohup kubectl port-forward --namespace starwhale-e2e svc/starwhale-e2e-controller 8082:8082 &
  sleep 10
fi

58 changes: 58 additions & 0 deletions scripts/e2e_test/check_job.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env bash

# Usage: check_job.sh <host:port>
# Logs into the controller, dumps the offline logs of every task of job 1,
# then exits non-zero unless the job finished with status SUCCESS.

set -e

if [[ -n ${DEBUG} ]]; then
    set -x
fi

echo "login"
# Capture the Authorization response header into auth_header.h; the response
# body is discarded. The while loop runs in a pipeline subshell, so the value
# is passed out via the file rather than a variable.
curl -D - --location --request POST "http://$1/api/v1/login" \
    --header 'Accept: application/json' \
    --form 'userName="starwhale"' \
    --form 'userPwd="abcd1234"' -o /dev/null | while read -r line
do
    if [[ "${line}" =~ ^Authorization.* ]] ; then
        echo "${line}" > auth_header.h
    fi
done

auth_header=$(cat auth_header.h)

# -y: non-interactive install, otherwise the confirmation prompt hangs CI
sudo apt-get install -y jq

echo "get task"
OUT=$(curl -X 'GET' \
    "http://$1/api/v1/project/starwhale/job/1/task?pageNum=1&pageSize=10" \
    -H 'accept: application/json' \
    -H "$auth_header" | jq '.data.list' | jq -r '.[]|.id')
echo "taskids: $OUT..."

# Split the newline-separated id list into an array (restore IFS afterwards).
SAVEIFS=$IFS
IFS=$'\n'
task_ids=($OUT)
IFS=$SAVEIFS

echo "get logs..."
for task_id in "${task_ids[@]}"
do
    # Each task exposes a list of offline log files; fetch and print the first.
    log_file=$(curl -X 'GET' "http://$1/api/v1/log/offline/$task_id" -H 'accept: application/json' -H "$auth_header" | jq -r '.data[0]')
    echo "$log_file"

    echo "task log is:"
    curl -X 'GET' "http://$1/api/v1/log/offline/$task_id/$log_file" -H 'accept: plain/text' -H "$auth_header"
done

echo "get job status"

job_status=$(curl -X 'GET' \
    "http://$1/api/v1/project/starwhale/job/1" \
    -H 'accept: application/json' \
    -H "$auth_header" | jq -r '.data.jobStatus')

echo "job status is $job_status"
if [[ "$job_status" != "SUCCESS" ]] ; then
    exit 1
fi
Empty file modified scripts/e2e_test/service_wait.sh
100644 → 100755
Empty file.
5 changes: 4 additions & 1 deletion scripts/e2e_test/start_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,8 @@ check_controller_service() {
sleep 15
done
nohup kubectl port-forward --namespace $SWNS svc/$SWNAME-controller 8082:$PORT_CONTROLLER &
pwd
check_controller_port.sh &
}

client_test() {
Expand All @@ -211,7 +213,8 @@ client_test() {
rm -rf .pytest_cache
rm -rf venv*
pushd ../
scripts/run_demo.sh
python3 scripts/client_test/cli_test.py simple
scripts/e2e_test/check_job.sh 127.0.0.1:$PORT_CONTROLLER
popd
popd

Expand Down
Loading

0 comments on commit e797feb

Please sign in to comment.