diff --git a/docker/Dockerfile.base_e2e b/docker/Dockerfile.base_e2e index af6c6a42de..e2d4ddd318 100644 --- a/docker/Dockerfile.base_e2e +++ b/docker/Dockerfile.base_e2e @@ -40,7 +40,7 @@ RUN apt-get install -y openjdk-11-jdk \ # Install python RUN add-apt-repository -y ppa:deadsnakes/ppa \ && apt-get update \ - && apt-get install -y python3.7 python3.8 python3.9 python3.10 python3-pip python3-venv python3.8-venv python3.7-venv python3.9-venv python3.10-venv \ + && apt-get install -y python3.7 python3.7-dev python3.8 python3.8-dev python3.9 python3.9-dev python3.10 python3.10-dev python3-pip python3-venv python3.8-venv python3.7-venv python3.9-venv python3.10-venv \ && apt-get clean all \ && rm -rf /var/lib/apt/lists/* /tmp/* diff --git a/docker/charts/templates/controller-deployment.yaml b/docker/charts/templates/controller-deployment.yaml index 077d1e32e5..2d21b10c4a 100644 --- a/docker/charts/templates/controller-deployment.yaml +++ b/docker/charts/templates/controller-deployment.yaml @@ -67,7 +67,7 @@ spec: value: controller {{- if not .Values.minikube.enabled }} - name: JVM_XMX - value: {{ .Values.resources.controller.requests.memory }} + value: {{ .Values.resources.controller.limits.memory }} {{- end }} - name: SW_CONTROLLER_PORT value: "{{ .Values.controller.containerPort }}" diff --git a/example/runtime/pytorch-cn-mirror/requirements.txt b/example/runtime/pytorch-cn-mirror/requirements.txt index 70aab3a2ca..81c2ba6bf0 100644 --- a/example/runtime/pytorch-cn-mirror/requirements.txt +++ b/example/runtime/pytorch-cn-mirror/requirements.txt @@ -9,19 +9,14 @@ pycocotools opencv-python-headless # fixed versions numpy==1.21.6; python_version=='3.7' -torch==1.11.0; python_version=='3.7' -torchaudio==0.11.0; python_version=='3.7' -torchdata==0.3.0; python_version=='3.7' -torchtext==0.12.0; python_version=='3.7' -torchvision==0.12.0; python_version=='3.7' scikit-learn==1.0.2; python_version=='3.7' scipy==1.7.3; python_version=='3.7' numpy==1.23.2; 
python_version>='3.8' -torch==1.12.1; python_version>='3.8' scikit-learn==1.1.1; python_version>='3.8' scipy==1.8.1; python_version>='3.8' importlib-metadata>=4.0.0, <=4.2.0;python_version < '3.8' -torchaudio==0.12.1; python_version>='3.8' -torchdata==0.4.1; python_version>='3.8' -torchtext==0.13.1; python_version>='3.8' -torchvision==0.13.1; python_version>='3.8' +torch==1.12.1 +torchaudio==0.12.1 +torchdata==0.4.1 +torchtext==0.13.1 +torchvision==0.13.1 diff --git a/example/runtime/pytorch-e2e/requirements-sw-lock.txt b/example/runtime/pytorch-e2e/requirements-sw-lock.txt deleted file mode 100644 index b7d29768c1..0000000000 --- a/example/runtime/pytorch-e2e/requirements-sw-lock.txt +++ /dev/null @@ -1,62 +0,0 @@ -appdirs==1.4.4 -attrs==21.4.0 -boto3==1.21.0 -botocore==1.24.46 -cattrs==1.7.1 -certifi==2022.6.15 -charset-normalizer==2.1.0 -click==8.1.3 -commonmark==0.9.1 -conda-pack==0.6.0 -contourpy==1.0.6 -cycler==0.11.0 -dill==0.3.5.1 -distlib==0.3.5 -filelock==3.7.1 -fonttools==4.37.1 -fs==2.4.16 -idna==3.3 -Jinja2==3.1.2 -jmespath==0.10.0 -joblib==1.1.0 -jsonlines==3.0.0 -kiwisolver==1.4.4 -loguru==0.6.0 -MarkupSafe==2.1.1 -matplotlib==3.5.3 -numpy==1.23.2 -nvidia-cublas-cu11==11.10.3.66 -nvidia-cuda-nvrtc-cu11==11.7.99 -nvidia-cuda-runtime-cu11==11.7.99 -nvidia-cudnn-cu11==8.5.0.96 -packaging==21.3 -Pillow==9.2.0 -platformdirs==2.5.2 -portalocker==2.5.1 -pyarrow==9.0.0 -pycocotools==2.0.4 -Pygments==2.12.0 -pyparsing==3.0.9 -python-dateutil==2.8.2 -PyYAML==6.0 -requests==2.28.1 -requests-toolbelt==0.9.1 -rich==12.5.1 -s3transfer==0.5.2 -scikit-learn==1.1.1 -scipy==1.8.1 -shellingham==1.5.0 -six==1.16.0 -tenacity==8.0.1 -textual==0.1.18 -threadpoolctl==3.1.0 -torch==1.12.1 -torchaudio==0.12.1 -torchdata==0.4.1 -torchtext==0.13.1 -torchvision==0.13.1 -tqdm==4.64.0 -typing_extensions==4.3.0 -urllib3==1.26.10 -virtualenv==20.15.1 -zipp==3.8.1 diff --git a/example/runtime/pytorch-e2e/requirements.txt b/example/runtime/pytorch-e2e/requirements.txt new file mode 
100755 index 0000000000..892de78bb0 --- /dev/null +++ b/example/runtime/pytorch-e2e/requirements.txt @@ -0,0 +1,17 @@ +# for pfp +pycocotools +# for ucf101 +opencv-python-headless +# fixed versions +numpy==1.21.6; python_version=='3.7' +scikit-learn==1.0.2; python_version=='3.7' +scipy==1.7.3; python_version=='3.7' +numpy==1.23.2; python_version>='3.8' +scikit-learn==1.1.1; python_version>='3.8' +scipy==1.8.1; python_version>='3.8' +importlib-metadata>=4.0.0, <=4.2.0;python_version < '3.8' +torch==1.12.1 +torchaudio==0.12.1 +torchdata==0.4.1 +torchtext==0.13.1 +torchvision==0.13.1 diff --git a/example/runtime/pytorch-e2e/runtime-3-10.yaml b/example/runtime/pytorch-e2e/runtime-3-10.yaml new file mode 100755 index 0000000000..99b1e1184c --- /dev/null +++ b/example/runtime/pytorch-e2e/runtime-3-10.yaml @@ -0,0 +1,19 @@ +api_version: 1.1 +dependencies: + - pip: + - Pillow + - numpy + - scikit-learn + - torchvision + - torch + - torchdata + - torchtext + - torchaudio + - pycocotools + - requirements.txt +environment: + python: "3.10" + arch: noarch + os: ubuntu:20.04 +mode: venv +name: pytorch diff --git a/example/runtime/pytorch-e2e/runtime-3-7.yaml b/example/runtime/pytorch-e2e/runtime-3-7.yaml new file mode 100755 index 0000000000..da5cbaf99e --- /dev/null +++ b/example/runtime/pytorch-e2e/runtime-3-7.yaml @@ -0,0 +1,19 @@ +api_version: 1.1 +dependencies: + - pip: + - Pillow + - numpy + - scikit-learn + - torchvision + - torch + - torchdata + - torchtext + - torchaudio + - pycocotools + - requirements.txt +environment: + arch: noarch + os: ubuntu:20.04 + python: 3.7 +mode: venv +name: pytorch diff --git a/example/runtime/pytorch-e2e/runtime-3-8.yaml b/example/runtime/pytorch-e2e/runtime-3-8.yaml new file mode 100755 index 0000000000..e02c04f7ae --- /dev/null +++ b/example/runtime/pytorch-e2e/runtime-3-8.yaml @@ -0,0 +1,19 @@ +api_version: 1.1 +dependencies: + - pip: + - Pillow + - numpy + - scikit-learn + - torchvision + - torch + - torchdata + - torchtext + - 
torchaudio + - pycocotools + - requirements.txt +environment: + arch: noarch + os: ubuntu:20.04 + python: 3.8 +mode: venv +name: pytorch diff --git a/example/runtime/pytorch-e2e/runtime-3-9.yaml b/example/runtime/pytorch-e2e/runtime-3-9.yaml new file mode 100755 index 0000000000..7aa35e0795 --- /dev/null +++ b/example/runtime/pytorch-e2e/runtime-3-9.yaml @@ -0,0 +1,19 @@ +api_version: 1.1 +dependencies: + - pip: + - Pillow + - numpy + - scikit-learn + - torchvision + - torch + - torchdata + - torchtext + - torchaudio + - pycocotools + - requirements.txt +environment: + python: 3.9 + arch: noarch + os: ubuntu:20.04 +mode: venv +name: pytorch diff --git a/example/runtime/pytorch-e2e/runtime.yaml b/example/runtime/pytorch-e2e/runtime.yaml index 9a77a2ebcc..a206f8bebb 100644 --- a/example/runtime/pytorch-e2e/runtime.yaml +++ b/example/runtime/pytorch-e2e/runtime.yaml @@ -16,7 +16,7 @@ dependencies: - torchtext - torchaudio - pycocotools - - requirements-sw-lock.txt + - requirements.txt environment: arch: noarch os: ubuntu:20.04 diff --git a/example/runtime/pytorch/requirements.txt b/example/runtime/pytorch/requirements.txt index 9e375bd97f..5b4ce04dbe 100644 --- a/example/runtime/pytorch/requirements.txt +++ b/example/runtime/pytorch/requirements.txt @@ -5,19 +5,14 @@ pycocotools opencv-python-headless # fixed versions numpy==1.21.6; python_version=='3.7' -torch==1.11.0; python_version=='3.7' -torchaudio==0.11.0; python_version=='3.7' -torchdata==0.3.0; python_version=='3.7' -torchtext==0.12.0; python_version=='3.7' -torchvision==0.12.0; python_version=='3.7' scikit-learn==1.0.2; python_version=='3.7' scipy==1.7.3; python_version=='3.7' numpy==1.23.2; python_version>='3.8' -torch==1.12.1; python_version>='3.8' scikit-learn==1.1.1; python_version>='3.8' scipy==1.8.1; python_version>='3.8' importlib-metadata>=4.0.0, <=4.2.0;python_version < '3.8' -torchaudio==0.12.1; python_version>='3.8' -torchdata==0.4.1; python_version>='3.8' -torchtext==0.13.1; python_version>='3.8' 
-torchvision==0.13.1; python_version>='3.8' +torch==1.12.1 +torchaudio==0.12.1 +torchdata==0.4.1 +torchtext==0.13.1 +torchvision==0.13.1 diff --git a/example/ucf101/requirements-sw-lock.txt b/example/ucf101/requirements-sw-lock.txt deleted file mode 100644 index 0e4b793b40..0000000000 --- a/example/ucf101/requirements-sw-lock.txt +++ /dev/null @@ -1,51 +0,0 @@ -# Generated by Starwhale(0.0.0.dev0) Runtime Lock -appdirs==1.4.4 -attrs==21.4.0 -boto3==1.21.0 -botocore==1.24.46 -cattrs==1.7.1 -certifi==2022.9.24 -charset-normalizer==2.1.1 -click==8.1.3 -click-option-group==0.5.5 -commonmark==0.9.1 -conda-pack==0.6.0 -dill==0.3.5.1 -distlib==0.3.6 -filelock==3.8.0 -fs==2.4.16 -idna==3.4 -Jinja2==3.1.2 -jmespath==0.10.0 -joblib==1.2.0 -jsonlines==3.0.0 -loguru==0.6.0 -MarkupSafe==2.1.1 -numpy==1.23.4 -nvidia-cublas-cu11==11.10.3.66 -nvidia-cuda-nvrtc-cu11==11.7.99 -nvidia-cuda-runtime-cu11==11.7.99 -nvidia-cudnn-cu11==8.5.0.96 -opencv-python-headless==4.6.0.66 -packaging==21.3 -platformdirs==2.5.3 -pyarrow==10.0.0 -Pygments==2.13.0 -pyparsing==3.0.9 -python-dateutil==2.8.2 -PyYAML==6.0 -requests==2.28.1 -requests-toolbelt==0.10.1 -rich==12.6.0 -s3transfer==0.5.2 -scikit-learn==1.1.3 -scipy==1.9.3 -shellingham==1.5.0 -six==1.16.0 -tenacity==8.1.0 -textual==0.1.18 -threadpoolctl==3.1.0 -torch==1.13.0 -typing_extensions==4.4.0 -urllib3==1.26.12 -virtualenv==20.16.6 diff --git a/example/ucf101/runtime.yaml b/example/ucf101/runtime.yaml deleted file mode 100644 index b63356fe2b..0000000000 --- a/example/ucf101/runtime.yaml +++ /dev/null @@ -1,8 +0,0 @@ -api_version: '1.1' -dependencies: -- requirements-sw-lock.txt -environment: - arch: noarch - os: ubuntu:20.04 -mode: venv -name: ucf101 diff --git a/scripts/client_test/cli_test.py b/scripts/client_test/cli_test.py index cc6ed6199f..e5643583eb 100644 --- a/scripts/client_test/cli_test.py +++ b/scripts/client_test/cli_test.py @@ -1,5 +1,6 @@ import os import sys +import shutil import typing as t import logging import subprocess 
@@ -68,12 +69,15 @@ "datasets": [""], }, } -RUNTIMES: t.Dict[str, t.Dict[str, str]] = { +RUNTIMES: t.Dict[str, t.Dict[str, t.Union[str, t.List[str]]]] = { "pytorch": { "workdir": f"{WORK_DIR}/example/runtime/pytorch-e2e", - }, - "ucf101": { - "workdir": f"{WORK_DIR}/example/ucf101", + "yamls": [ + "runtime-3-7.yaml", + "runtime-3-8.yaml", + "runtime-3-9.yaml", + "runtime-3-10.yaml", + ], }, } @@ -98,7 +102,7 @@ def __init__( self.server_url = server_url self.server_project = server_project self.datasets: t.Dict[str, t.List[URI]] = {} - self.runtimes: t.Dict[str, URI] = {} + self.runtimes: t.Dict[str, t.List[URI]] = {} self.models: t.Dict[str, URI] = {} if self.server_url: logger.info(f"login to server {self.server_url} ...") @@ -156,16 +160,22 @@ def build_model( def build_runtime( self, _workdir: str, + runtime_yaml: str = "runtime.yaml", ) -> t.Any: self.select_local_instance() - _uri = Runtime.build_with_api(workdir=_workdir) + runtime_cache_path = f"{_workdir}/.starwhale" + if os.path.exists(runtime_cache_path): + shutil.rmtree(runtime_cache_path) + _uri = Runtime.build_with_api(workdir=_workdir, runtime_yaml=runtime_yaml) if self.server_url: assert self.runtime_api.copy( src_uri=_uri.full_uri, target_project=f"cloud://server/project/{self.server_project}", force=True, ) - self.runtimes.update({_uri.object.name: _uri}) + rts = self.runtimes.get(_uri.object.name, []) + rts.append(_uri) + self.runtimes.update({_uri.object.name: rts}) assert len(self.runtime_api.list()) assert self.runtime_api.info(_uri.full_uri) return _uri @@ -177,55 +187,70 @@ def select_local_instance(self) -> None: def eval( self, _model_uri: URI, - _rt_uri: URI, + _rt_uris: t.List[URI], _ds_uris: t.List[URI], step_spec_file: str, local_instance: bool = True, - ) -> Future: - if local_instance: - _jid = self.local_evl(_ds_uris, _model_uri, _rt_uri) - return executor.submit(lambda: (_jid, next(iter(STATUS_SUCCESS)))) - if self.server_url and not local_instance: - return 
self.remote_eval(_ds_uris, _model_uri, _rt_uri, step_spec_file) - return executor.submit(lambda: ("", next(iter(STATUS_SUCCESS)))) + ) -> t.List[Future]: + if not local_instance and self.server_url: + return self.remote_eval(_ds_uris, _model_uri, _rt_uris, step_spec_file) + else: + self.local_evl(_ds_uris, _model_uri, _rt_uris) + return [] def local_evl( - self, _ds_uris: t.List[URI], _model_uri: URI, _rt_uri: t.Optional[URI] = None + self, + _ds_uris: t.List[URI], + _model_uri: URI, + _rt_uris: t.Optional[t.List[URI]] = None, ) -> t.Any: logger.info("running evaluation at local...") self.select_local_instance() - _job_id = self.evaluation_api.run( - model=_model_uri.full_uri, - datasets=[_ds_uri.full_uri for _ds_uri in _ds_uris], - runtime=_rt_uri.full_uri if _rt_uri else "", - ) - assert _job_id - assert len(self.evaluation_api.list()) - eval_info = self.evaluation_api.info(_job_id) - assert eval_info - assert eval_info["manifest"]["status"] in STATUS_SUCCESS - logger.info("finish run evaluation at standalone.") - return _job_id + jids = [] + if not _rt_uris: + _rt_uris = [URI("")] + for _rt_uri in _rt_uris: + _job_id = self.evaluation_api.run( + model=_model_uri.full_uri, + datasets=[_ds_uri.full_uri for _ds_uri in _ds_uris], + runtime=_rt_uri.full_uri if _rt_uri.raw else "", + ) + assert _job_id + assert len(self.evaluation_api.list()) + eval_info = self.evaluation_api.info(_job_id) + assert eval_info + assert eval_info["manifest"]["status"] in STATUS_SUCCESS + logger.info("finish run evaluation at standalone.") + jids.append(_job_id) + return jids def remote_eval( - self, _ds_uris: t.List[URI], _model_uri: URI, _rt_uri: URI, step_spec_file: str - ) -> Future: + self, + _ds_uris: t.List[URI], + _model_uri: URI, + _rt_uris: t.List[URI], + step_spec_file: str, + ) -> t.List[Future]: self.instance_api.select(instance="server") self.project_api.select(project=self.server_project) # 8.start an evaluation - logger.info("running evaluation at server...") - _remote_jid 
= self.evaluation_api.run( - model=_model_uri.object.version, - datasets=[_ds_uri.object.version for _ds_uri in _ds_uris], - runtime=_rt_uri.object.version, - project=f"{self.server_url}/project/{self.server_project}", - step_spec=step_spec_file, - resource_pool=os.environ.get("RESOURCE_POOL"), - ) - assert _remote_jid - # 9.check job's status - _js = executor.submit(self.get_remote_job_status, _remote_jid) - return _js + job_status_checkers = [] + for _rt_uri in _rt_uris: + logger.info("running evaluation at server...") + _remote_jid = self.evaluation_api.run( + model=_model_uri.object.version, + datasets=[_ds_uri.object.version for _ds_uri in _ds_uris], + runtime=_rt_uri.object.version, + project=f"{self.server_url}/project/{self.server_project}", + step_spec=step_spec_file, + resource_pool=os.environ.get("RESOURCE_POOL"), + ) + assert _remote_jid + # 9.check job's status + job_status_checkers.append( + executor.submit(self.get_remote_job_status, _remote_jid) + ) + return job_status_checkers def get_remote_job_status(self, job_id: str) -> t.Tuple[str, str]: while True: @@ -262,17 +287,15 @@ def test_simple(self) -> None: self.local_evl([_ds_uri], _model_uri) if self.server_url: - _js = self.remote_eval( - [_ds_uri], _model_uri, _rt_uri, step_spec_f("step_spec_cpu_mini.yaml") + _jss = self.remote_eval( + [_ds_uri], _model_uri, [_rt_uri], step_spec_f("step_spec_cpu_mini.yaml") ) - _, status = _js.result() - assert status in STATUS_SUCCESS + for _js in _jss: + jid, status = _js.result() + assert status in STATUS_SUCCESS def test_all(self) -> None: - for name, rt in RUNTIMES.items(): - self.build_runtime(rt["workdir"]) - for name, expl in EXAMPLES.items(): logger.info(f"preparing data for {expl}") rc = subprocess.call( @@ -289,12 +312,12 @@ def test_all(self) -> None: self.build_dataset(workdir_, d_type) self.build_model(workdir_) - # run evals on standalone - for name, expl in EXAMPLES.items(): - expl.get("device", "cpu") == "cpu" and self.run_example( - name, - 
step_spec_f("step_spec_cpu_full.yaml"), - ) + for name, rt in RUNTIMES.items(): + if "yamls" not in rt: + self.build_runtime(str(rt["workdir"])) + else: + for yml in list(rt["yamls"]): + self.build_runtime(str(rt["workdir"]), yml) # run evals on server res = [ @@ -305,7 +328,16 @@ ) for name, expl in EXAMPLES.items() ] - for _js in res: + status_checkers: t.List[Future] = sum(res, []) + + # run evals on standalone + for name, expl in EXAMPLES.items(): + expl.get("device", "cpu") == "cpu" and self.run_example( + name, + step_spec_f("step_spec_cpu_full.yaml"), + ) + + for _js in status_checkers: jid, status = _js.result() if status not in STATUS_SUCCESS: logger.error(f"job {jid} failed!") @@ -315,19 +347,25 @@ def test_expl(self, expl_name: str) -> None: rt_ = RUNTIMES.get(expl_name) or RUNTIMES.get("pytorch") if not rt_: raise RuntimeError(f"no runtime matching for {expl_name}") - self.build_runtime(str(rt_.get("workdir"))) + for name, rt in RUNTIMES.items(): + if "yamls" not in rt: + self.build_runtime(str(rt["workdir"])) + else: + for yml in list(rt["yamls"]): + self.build_runtime(str(rt["workdir"]), yml) expl = EXAMPLES[expl_name] workdir_ = str(expl["workdir"]) - p = subprocess.Popen( - ["make", "prepare"], + p = subprocess.Popen( + ["make", "CN=1", "prepare"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, cwd=workdir_, ) - - p.wait() + if p.wait() != 0: + logger.error(f"prepare data for {expl} failed") + raise RuntimeError(f"prepare data for {expl} failed") # download data for d_type in expl["datasets"]: @@ -335,29 +373,25 @@ def test_expl(self, expl_name: str) -> None: self.build_model(workdir_) # run_eval - _js = self.run_example( + self.run_example( expl_name, step_spec_f(f"step_spec_{expl.get('device', 'cpu')}_full.yaml"), expl.get("device", "cpu") == "cpu", ) - jid, status = _js.result() - if status not in STATUS_SUCCESS: - logger.error(f"job {jid} failed!") - exit(1) def run_example( self, name: str, step_spec: str, local_instance: bool = True - ) -> 
t.List[Future]: datasets_ = self.datasets.get(name) if not datasets_: raise RuntimeError("datasets should not be empty") model_ = self.models.get(name) if not model_: raise RuntimeError("model should not be empty") - runtime_ = self.runtimes.get(name) or self.runtimes.get("pytorch") - if not runtime_: - raise RuntimeError("runtime should not be empty") - return self.eval(model_, runtime_, datasets_, step_spec, local_instance) + runtimes_ = self.runtimes.get(name) or self.runtimes.get("pytorch") + if not runtimes_: + raise RuntimeError("runtimes should not be empty") + return self.eval(model_, runtimes_, datasets_, step_spec, local_instance) def debug(self) -> None: for name, expl in EXAMPLES.items(): diff --git a/scripts/e2e_test/docker/Dockerfile.e2e b/scripts/e2e_test/docker/Dockerfile.e2e index ad25762157..16b078f8e3 100644 --- a/scripts/e2e_test/docker/Dockerfile.e2e +++ b/scripts/e2e_test/docker/Dockerfile.e2e @@ -1,3 +1,3 @@ -FROM homepage-ca.intra.starwhale.ai:5000/sw-e2e-base:0.8 +FROM homepage-ca.intra.starwhale.ai:5000/sw-e2e-base:latest COPY entrypoint.sh / ENTRYPOINT ["/entrypoint.sh"] diff --git a/scripts/e2e_test/docker/entrypoint.sh b/scripts/e2e_test/docker/entrypoint.sh index b7b2c75566..6632b2ac31 100755 --- a/scripts/e2e_test/docker/entrypoint.sh +++ b/scripts/e2e_test/docker/entrypoint.sh @@ -32,12 +32,16 @@ if ! 
test -d /starwhale; then fi git config --global --add safe.directory /starwhale git config --global user.email "renyanda@starwhale.ai" + +python3 -m pip install --upgrade pip +python3 -m pip config set global.cache-dir /.cache/pip +python3 -m pip config set global.default-timeout 300 cd /starwhale/scripts/e2e_test if [[ -z "$PUBLISH" ]] ; then if bash start_test.sh ;then - send_feishu "e2e SUCCESS ns:$SWNS name:$SWNAME" + send_feishu "e2e SUCCESS: console path http://$SWNAME.pre.intra.starwhale.ai" else - send_feishu "e2e FAIL ns:$SWNS name:$SWNAME" + send_feishu "e2e FAIL: ns:$SWNS error log is: $LOG_STORAGE/log/log.$(date +'%Y%m%d')" fi else cd /starwhale/scripts/publish diff --git a/scripts/publish/pub.sh b/scripts/publish/pub.sh index 55eb5786d5..b86055f0ae 100644 --- a/scripts/publish/pub.sh +++ b/scripts/publish/pub.sh @@ -102,6 +102,7 @@ deploy() { --set minio.image=docker-registry.starwhale.cn/bitnami/minio:2022.6.20-debian-11-r0 \ --set devMode.createPV.enabled=true \ --set devMode.createPV.host=host005-bj01 \ + --set resources.controller.limits.memory=16G \ --set devMode.createPV.rootPath=/mnt/data/starwhale/$SWNS/$SWNAME \ --set minio.ingress.host=${SWNAME//./-}-minio.pre.intra.starwhale.ai \ --set controller.ingress.host=${SWNAME//./-}.pre.intra.starwhale.ai diff --git a/server/controller/src/main/java/ai/starwhale/mlops/domain/dataset/objectstore/DsFileGetter.java b/server/controller/src/main/java/ai/starwhale/mlops/domain/dataset/objectstore/DsFileGetter.java index 32a4ab3a87..deac0e4883 100644 --- a/server/controller/src/main/java/ai/starwhale/mlops/domain/dataset/objectstore/DsFileGetter.java +++ b/server/controller/src/main/java/ai/starwhale/mlops/domain/dataset/objectstore/DsFileGetter.java @@ -23,7 +23,6 @@ import ai.starwhale.mlops.exception.SwValidationException; import ai.starwhale.mlops.exception.SwValidationException.ValidSubject; import ai.starwhale.mlops.storage.StorageAccessService; -import ai.starwhale.mlops.storage.StorageObjectInfo; 
import ai.starwhale.mlops.storage.StorageUri; import java.io.IOException; import java.io.InputStream; @@ -85,7 +84,7 @@ public byte[] dataOf(Long datasetId, String uri, Long offset, } StorageAccessService storageAccessService = storageAccessParser.getStorageAccessServiceFromUri(getStorageUri(uri)); - String path = checkPath(datasetId, uri, storageAccessService); + String path = checkPath(datasetId, storageUri); try (InputStream inputStream = validParam(size, offset) ? storageAccessService.get(path, offset, size) : storageAccessService.get(path)) { return inputStream.readAllBytes(); @@ -102,7 +101,7 @@ public String linkOf(Long datasetId, String uri, Long expTimeMillis) { return uri; } StorageAccessService storageAccessService = storageAccessParser.getStorageAccessServiceFromUri(storageUri); - String path = checkPath(datasetId, uri, storageAccessService); + String path = checkPath(datasetId, storageUri); try { return storageAccessService.signedUrl(path, expTimeMillis); } catch (IOException e) { @@ -110,25 +109,13 @@ public String linkOf(Long datasetId, String uri, Long expTimeMillis) { } } - private String checkPath(Long datasetId, String uri, StorageAccessService storageAccessService) { - String path; - try { - path = new StorageUri(uri).getPathAfterBucket(); - } catch (URISyntaxException e) { - log.error("malformed uri {}", uri, e); - throw new SwValidationException(ValidSubject.DATASET, "malformed uri"); - } - StorageObjectInfo objectInfo; - try { - objectInfo = storageAccessService.head(path); - } catch (IOException e) { - throw new SwProcessException(ErrorType.STORAGE, "error while accessing storage", e); - } - if (!objectInfo.isExists()) { - DatasetVersionEntity versionById = datasetVersionMapper.find(datasetId); - path = StringUtils.trimTrailingCharacter(versionById.getStoragePath(), '/') + "/" - + StringUtils.trimLeadingCharacter(path, '/'); + private String checkPath(Long datasetId, StorageUri uri) { + String path = uri.getPathAfterBucket(); + if 
(StringUtils.hasText(uri.getSchema())) { + return path; } - return path; + DatasetVersionEntity versionById = datasetVersionMapper.find(datasetId); + return StringUtils.trimTrailingCharacter(versionById.getStoragePath(), '/') + "/" + + StringUtils.trimLeadingCharacter(path, '/'); } } diff --git a/server/controller/src/main/java/ai/starwhale/mlops/domain/job/JobService.java b/server/controller/src/main/java/ai/starwhale/mlops/domain/job/JobService.java index 337eae9c2a..63050375e5 100644 --- a/server/controller/src/main/java/ai/starwhale/mlops/domain/job/JobService.java +++ b/server/controller/src/main/java/ai/starwhale/mlops/domain/job/JobService.java @@ -157,6 +157,7 @@ public Boolean recoverJob(String projectUrl, String jobUrl) { throw new UnsupportedOperationException("Please use TrashService.recover() instead."); } + @Transactional public Long createJob(String projectUrl, String modelVersionUrl, String datasetVersionUrls, String runtimeVersionUrl, String comment, String resourcePool, diff --git a/server/controller/src/main/java/ai/starwhale/mlops/schedule/k8s/JobEventHandler.java b/server/controller/src/main/java/ai/starwhale/mlops/schedule/k8s/JobEventHandler.java index 765c2a2d7f..81944ab690 100644 --- a/server/controller/src/main/java/ai/starwhale/mlops/schedule/k8s/JobEventHandler.java +++ b/server/controller/src/main/java/ai/starwhale/mlops/schedule/k8s/JobEventHandler.java @@ -67,13 +67,16 @@ private TaskStatus statusOf(V1Job newObj) { //one task one k8s job if (null != status.getFailed()) { taskStatus = TaskStatus.FAIL; - log.debug("job status changed for {} is failed {}", jobName(newObj), status); + log.error("job status changed for {} is failed {}", jobName(newObj), status); + String spec = null != newObj.getSpec() ? newObj.getSpec().toString() : null; + String metadata = null != newObj.getMetadata() ? 
newObj.getMetadata().toString() : null; + log.error("job failed with spec:\n{} \njob failed with metadata:\n{}", spec, metadata); } else if (null != status.getActive()) { taskStatus = TaskStatus.RUNNING; - log.debug("job status changed for {} is running {}", jobName(newObj), status); + log.info("job status changed for {} is running {}", jobName(newObj), status); } else if (null != status.getSucceeded()) { taskStatus = TaskStatus.SUCCESS; - log.debug("job status changed for {} is success {}", jobName(newObj), status); + log.info("job status changed for {} is success {}", jobName(newObj), status); } else { taskStatus = TaskStatus.UNKNOWN; log.warn("job status changed for {} is unknown {}", jobName(newObj), status); diff --git a/server/controller/src/main/resources/application.yaml b/server/controller/src/main/resources/application.yaml index b89bf2ae7b..02418dfb3a 100644 --- a/server/controller/src/main/resources/application.yaml +++ b/server/controller/src/main/resources/application.yaml @@ -41,7 +41,7 @@ sw: event-holder-ttl-in-seconds: ${SW_K8S_EVENT_HOLDER_TTL_IN_SECS:43200} # 12h job: template-path: ${SW_K8S_JOB_TEMPLATE_PATH:} - restart-policy: ${SW_K8S_JOB_RESTART_POLICY:OnFailure} + restart-policy: ${SW_K8S_JOB_RESTART_POLICY:Never} backoff-limit: ${SW_K8S_JOB_BACKOFF_LIMIT:10} storage: type: ${SW_STORAGE_TYPE:minio} diff --git a/server/controller/src/test/java/ai/starwhale/mlops/domain/dataset/objectstore/DsFileGetterTest.java b/server/controller/src/test/java/ai/starwhale/mlops/domain/dataset/objectstore/DsFileGetterTest.java index f2ef3dd186..2b2ab3fec8 100644 --- a/server/controller/src/test/java/ai/starwhale/mlops/domain/dataset/objectstore/DsFileGetterTest.java +++ b/server/controller/src/test/java/ai/starwhale/mlops/domain/dataset/objectstore/DsFileGetterTest.java @@ -78,17 +78,15 @@ public void testLinkOf() throws IOException { StorageAccessParser storageAccessParser = mock(StorageAccessParser.class); StorageAccessService storageAccessService = mock( 
StorageAccessService.class); - when(storageAccessService.head("/bdcsd")).thenReturn(new StorageObjectInfo(false, 1L, null)); - when(storageAccessService.head("bdc/bdcsd")).thenReturn(new StorageObjectInfo(true, 1L, null)); - when(storageAccessService.signedUrl(eq("bdc/bdcsd"), anyLong())).thenReturn("abc"); + when(storageAccessService.signedUrl(eq("/bdc/bdcsd"), anyLong())).thenReturn("abc"); when(storageAccessParser.getStorageAccessServiceFromUri(any())).thenReturn( storageAccessService); DatasetVersionMapper versionMapper = mock(DatasetVersionMapper.class); when(versionMapper.find(anyLong())).thenReturn( - DatasetVersionEntity.builder().storagePath("bdc").build()); + DatasetVersionEntity.builder().storagePath("/bdc").build()); DsFileGetter fileGetter = new DsFileGetter(storageAccessParser, versionMapper); Assertions.assertEquals("abc", fileGetter.linkOf(1L, "/bdcsd", 1L)); - Assertions.assertEquals("abc", fileGetter.linkOf(1L, "bdc/bdcsd", 1L)); + Assertions.assertEquals("abc", fileGetter.linkOf(1L, "s3://host:9080/bucket/bdc/bdcsd", 1L)); } } \ No newline at end of file