This repository was archived by the owner on Aug 7, 2025. It is now read-only.

Upgrade to PyTorch 1.13 #1980

Merged: 23 commits merged into master from upgrade_pytorch_113 on Nov 21, 2022.
Commits (23)
40ce815
Changes to support PyTorch 1.13
agunapal Nov 16, 2022
cc414e1
Merge branch 'master' into upgrade_pytorch_113
agunapal Nov 17, 2022
08b4274
review comments
agunapal Nov 17, 2022
a50335d
Updated default CUDA version for docker to cu116
agunapal Nov 17, 2022
9eac44d
Updated default CUDA version for docker to cu116
agunapal Nov 17, 2022
6e15536
Updated default ubuntu version to be 20.04 wherever applicable
agunapal Nov 17, 2022
97f29ca
Updated default CUDA version to CUDA 11.6
agunapal Nov 17, 2022
795c769
Updated docker to CUDA 11.7 as default
agunapal Nov 18, 2022
e8b59ad
Removed ubuntu arg from docker build
agunapal Nov 18, 2022
169665b
Added github action for cpu regression tests
agunapal Nov 18, 2022
483bb5b
Added github action for cpu regression tests
agunapal Nov 18, 2022
932e00c
Added github action for cpu regression tests
agunapal Nov 18, 2022
48211bb
Added github action for cpu regression tests
agunapal Nov 18, 2022
6d9c1d4
Added gpu regression tests action
agunapal Nov 18, 2022
6c2accf
Added gpu regression tests action
agunapal Nov 18, 2022
bfabe0f
Merge branch 'master' into upgrade_pytorch_113
msaroufim Nov 18, 2022
ab09c41
change runner
agunapal Nov 18, 2022
97413a0
Merge branch 'upgrade_pytorch_113' of https://github.com/pytorch/serv…
agunapal Nov 18, 2022
a6e30a2
added java 17 to github actions
agunapal Nov 18, 2022
cd1f098
update git version
agunapal Nov 18, 2022
c73137e
Verified GPU regression tests to be working
agunapal Nov 18, 2022
ba6b4d0
Skipping regression tests on windows
agunapal Nov 18, 2022
23fb33b
Skipping regression tests on windows
agunapal Nov 18, 2022
31 changes: 31 additions & 0 deletions .github/workflows/regression_tests_cpu.yml
@@ -0,0 +1,31 @@
name: Run Regression Tests on CPU

on: workflow_dispatch

jobs:
  regression-cpu:
    # creates workflows for OS: ubuntu, macOS
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-20.04, macOS-latest]
    steps:
      - name: Setup Python 3.8
        uses: actions/setup-python@v3
        with:
          python-version: 3.8
          architecture: x64
      - name: Setup Java 17
        uses: actions/setup-java@v3
        with:
          distribution: 'zulu'
          java-version: '17'
      - name: Checkout TorchServe
        uses: actions/checkout@v3
      - name: Install dependencies
        run: |
          python ts_scripts/install_dependencies.py --environment=dev
      - name: Torchserve Regression Tests
        run: |
          python test/regression_tests.py
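
Since the workflow is gated on `workflow_dispatch`, it only runs when triggered manually, either from the Actions tab or through the GitHub REST API. The sketch below shows one way to dispatch it programmatically; the repository slug, target branch, and the `GITHUB_TOKEN` environment variable are illustrative assumptions, not part of this PR.

```python
# Minimal sketch: dispatch the CPU regression workflow via the GitHub REST API.
# Assumes a token with "workflow" scope in GITHUB_TOKEN; the repo slug and ref
# are placeholders to adapt to your fork/branch.
import os

import requests

OWNER_REPO = "pytorch/serve"                # assumed repository slug
WORKFLOW_FILE = "regression_tests_cpu.yml"  # workflow file added in this PR
REF = "master"                              # branch to run against (assumption)

resp = requests.post(
    f"https://api.github.com/repos/{OWNER_REPO}/actions/workflows/{WORKFLOW_FILE}/dispatches",
    headers={
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
    },
    json={"ref": REF},
)
resp.raise_for_status()  # GitHub responds with 204 No Content on success
print(f"Dispatched {WORKFLOW_FILE} on {REF}")
```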
43 changes: 43 additions & 0 deletions .github/workflows/regression_tests_gpu.yml
@@ -0,0 +1,43 @@
name: Run Regression Tests on GPU

on: workflow_dispatch

jobs:
  regression-gpu:
    # creates workflows for CUDA 11.6 & CUDA 11.7 on ubuntu
    runs-on: [self-hosted, ci-gpu]
    strategy:
      fail-fast: false
      matrix:
        cuda: ["cu116", "cu117"]
    steps:
      - name: Clean up previous run
        run: |
          echo "Cleaning up previous run"
          cd $RUNNER_WORKSPACE
          pwd
          cd ..
          pwd
          rm -rf _tool
      - name: Update git
        run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt-get update && sudo apt-get install git -y
      - name: Check git version
        run: git --version
      - name: Setup Python 3.8
        uses: actions/setup-python@v3
        with:
          python-version: 3.8
          architecture: x64
      - name: Setup Java 17
        uses: actions/setup-java@v3
        with:
          distribution: 'zulu'
          java-version: '17'
      - name: Checkout TorchServe
        uses: actions/checkout@v3
      - name: Install dependencies
        run: |
          python ts_scripts/install_dependencies.py --environment=dev --cuda=${{ matrix.cuda }}
      - name: Torchserve Regression Tests
        run: |
          python test/regression_tests.py
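
The GPU job fans out over the CUDA matrix, so the same regression suite runs once against cu116 and once against cu117 on the self-hosted `ci-gpu` runner. Below is a minimal sketch of reproducing the two key steps locally for a single CUDA variant, assuming a checkout of the repository root on a machine with a compatible GPU and driver:

```python
# Sketch: run the GPU regression job's install + test steps locally for one CUDA
# variant. The CUDA tag is one of the matrix values above; run from the repo root.
import subprocess

cuda = "cu117"  # or "cu116", mirroring the workflow matrix

# Install the dev environment against the chosen CUDA build (same command as the workflow step).
subprocess.run(
    ["python", "ts_scripts/install_dependencies.py", "--environment=dev", f"--cuda={cuda}"],
    check=True,
)

# Run the same regression suite the workflow invokes.
subprocess.run(["python", "test/regression_tests.py"], check=True)
```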
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -22,7 +22,7 @@ Your contributions will fall into two categories:
```bash
python ts_scripts/install_dependencies.py --environment=dev --cuda=cu102
```
> Supported cuda versions as cu116, cu113, cu111, cu102, cu101, cu92
> Supported cuda versions as cu117, cu116, cu113, cu111, cu102, cu101, cu92
- Install `pre-commit` to your Git flow:
```bash
pre-commit install
132 changes: 83 additions & 49 deletions benchmarks/auto_benchmark.py
@@ -1,30 +1,31 @@
import argparse
import datetime
import os
import ruamel.yaml
import shutil
from subprocess import Popen
from utils import gen_model_config_json
from utils import gen_md_report
from utils import gen_metrics_json

import ruamel.yaml
from utils import gen_md_report, gen_metrics_json, gen_model_config_json

CWD = os.getcwd()
MODEL_JSON_CONFIG_PATH = CWD + '/model_json_config'
BENCHMARK_TMP_PATH = '/tmp/benchmark'
BENCHMARK_REPORT_PATH = '/tmp/ts_benchmark'
TS_LOGS_PATH = CWD + '/logs'
MODEL_STORE = '/tmp/model_store'
WF_STORE = '/tmp/wf_store'
MODEL_JSON_CONFIG_PATH = CWD + "/model_json_config"
BENCHMARK_TMP_PATH = "/tmp/benchmark"
BENCHMARK_REPORT_PATH = "/tmp/ts_benchmark"
TS_LOGS_PATH = CWD + "/logs"
MODEL_STORE = "/tmp/model_store"
WF_STORE = "/tmp/wf_store"


class BenchmarkConfig:
def __init__(self, yaml_dict, skip_ts_install):
self.yaml_dict = yaml_dict
self.skip_ts_install = skip_ts_install
self.bm_config = {}
yesterday = datetime.date.today() - datetime.timedelta(days=1)
self.bm_config["version"] = \
"torchserve-nightly=={}.{}.{}".format(yesterday.year, yesterday.month, yesterday.day)
self.bm_config["hardware"] = 'cpu'
self.bm_config["version"] = "torchserve-nightly=={}.{}.{}".format(
yesterday.year, yesterday.month, yesterday.day
)
self.bm_config["hardware"] = "cpu"

def ts_version(self, version):
for k, v in version.items():
@@ -48,15 +49,15 @@ def metrics_cmd(self, cmd):
for k, v in key_value.items():
if k == "cmd":
cmd_options.append(v)
elif k == '--namespace':
elif k == "--namespace":
cmd_options.append(k)
cmd_options.append(''.join(v))
cmd_options.append("".join(v))
else:
cmd_options.append(k)
cmd_options.append(v)
break

self.bm_config["metrics_cmd"] = ' '.join(cmd_options)
self.bm_config["metrics_cmd"] = " ".join(cmd_options)

def report_cmd(self, cmd):
cmd_options = []
@@ -70,12 +71,14 @@ def report_cmd(self, cmd):
today = datetime.date.today()
v[i] = "{}-{}-{}".format(today.year, today.month, today.day)
break
cmd_options.append('{}/{}'.format('/'.join(v), self.bm_config["version"]))
cmd_options.append(
"{}/{}".format("/".join(v), self.bm_config["version"])
)
else:
cmd_options.append(v)
break

self.bm_config["report_cmd"] = ' '.join(cmd_options)
self.bm_config["report_cmd"] = " ".join(cmd_options)

def load_config(self):
report_cmd = None
@@ -91,10 +94,11 @@ def load_config(self):
elif k == "report_cmd":
report_cmd = v

self.bm_config["model_config_path"] = \
'{}/cpu'.format(MODEL_JSON_CONFIG_PATH) \
if self.bm_config["hardware"] == 'cpu' \
else '{}/gpu'.format(MODEL_JSON_CONFIG_PATH)
self.bm_config["model_config_path"] = (
"{}/cpu".format(MODEL_JSON_CONFIG_PATH)
if self.bm_config["hardware"] == "cpu"
else "{}/gpu".format(MODEL_JSON_CONFIG_PATH)
)

if self.skip_ts_install:
self.bm_config["version"] = get_torchserve_version()
@@ -105,67 +109,75 @@ def load_config(self):
for k, v in self.bm_config.items():
print("{}={}".format(k, v))


def load_benchmark_config(bm_config_path, skip_ts_install):
yaml = ruamel.yaml.YAML()
with open(bm_config_path, 'r') as f:
with open(bm_config_path, "r") as f:
yaml_dict = yaml.load(f)

benchmark_config = BenchmarkConfig(yaml_dict, skip_ts_install)
benchmark_config.load_config()

return benchmark_config.bm_config


def benchmark_env_setup(bm_config, skip_ts_install):
install_torchserve(skip_ts_install, bm_config["hardware"], bm_config["version"])
setup_benchmark_path(bm_config["model_config_path"])
build_model_json_config(bm_config["models"])


def install_torchserve(skip_ts_install, hw, ts_version):
if skip_ts_install:
return

# git checkout branch if it is needed
cmd = 'git checkout master && git reset --hard && git clean -dffx . && git pull --rebase'
cmd = "git checkout master && git reset --hard && git clean -dffx . && git pull --rebase"
execute(cmd, wait=True)
print("successfully reset git")

ts_install_cmd = None
if ts_version.startswith("torchserve==") or ts_version.startswith("torchserve-nightly=="):
ts_install_cmd = 'pip install {}'.format(ts_version)
if ts_version.startswith("torchserve==") or ts_version.startswith(
"torchserve-nightly=="
):
ts_install_cmd = "pip install {}".format(ts_version)
else:
cmd = 'git checkout {}'.format(ts_version)
cmd = "git checkout {}".format(ts_version)
execute(cmd, wait=True)

# install_dependencies.py
if hw == 'gpu':
cmd = 'python ts_scripts/install_dependencies.py --environment dev --cuda cu102'
if hw == "gpu":
cmd = "python ts_scripts/install_dependencies.py --environment dev --cuda cu117"
Review comment from the PR author (agunapal): Changed default CUDA version for benchmarking to CUDA 11.7 @lxning

else:
cmd = 'python ts_scripts/install_dependencies.py --environment dev'
cmd = "python ts_scripts/install_dependencies.py --environment dev"
execute(cmd, wait=True)
print("successfully install install_dependencies.py")

# install torchserve
if ts_install_cmd is None:
ts_install_cmd = 'python ts_scripts/install_from_src.py'
ts_install_cmd = "python ts_scripts/install_from_src.py"
execute(ts_install_cmd, wait=True)
print("successfully install torchserve")


def setup_benchmark_path(model_config_path):
benchmark_path_list = [BENCHMARK_TMP_PATH, BENCHMARK_REPORT_PATH, model_config_path]
for benchmark_path in benchmark_path_list:
shutil.rmtree(benchmark_path, ignore_errors=True)
os.makedirs(benchmark_path, exist_ok=True)

print('successfully setup benchmark_path={}'.format(benchmark_path))
print("successfully setup benchmark_path={}".format(benchmark_path))


def build_model_json_config(models):
for model in models:
if model.startswith('/'):
if model.startswith("/"):
input_file = model
else:
input_file = CWD + '/benchmarks/models_config/{}'.format(model)
input_file = CWD + "/benchmarks/models_config/{}".format(model)
gen_model_config_json.convert_yaml_to_json(input_file, MODEL_JSON_CONFIG_PATH)


def run_benchmark(bm_config):
files = os.listdir(bm_config["model_config_path"])
files.sort()
@@ -174,67 +186,84 @@ def run_benchmark(bm_config):
# call benchmark-ab.py
shutil.rmtree(TS_LOGS_PATH, ignore_errors=True)
shutil.rmtree(BENCHMARK_TMP_PATH, ignore_errors=True)
cmd = 'python ./benchmarks/benchmark-ab.py --tmp_dir /tmp --report_location /tmp --config_properties ' \
'./benchmarks/config.properties --config {}/{}'\
.format(bm_config["model_config_path"], model_json_config)
cmd = (
"python ./benchmarks/benchmark-ab.py --tmp_dir /tmp --report_location /tmp --config_properties "
"./benchmarks/config.properties --config {}/{}".format(
bm_config["model_config_path"], model_json_config
)
)
execute(cmd, wait=True)

# generate stats metrics from ab_report.csv
bm_model = model_json_config[0: -len('.json')]
bm_model = model_json_config[0 : -len(".json")]

gen_metrics_json.gen_metric(
'{}/ab_report.csv'.format(BENCHMARK_TMP_PATH),
'{}/logs/stats_metrics.json'.format(BENCHMARK_TMP_PATH)
"{}/ab_report.csv".format(BENCHMARK_TMP_PATH),
"{}/logs/stats_metrics.json".format(BENCHMARK_TMP_PATH),
)

# load stats metrics to remote metrics storage
if "metrics_cmd" in bm_config:
execute(bm_config["metrics_cmd"], wait=True)

# cp benchmark logs to local
bm_model_log_path = '{}/{}'.format(BENCHMARK_REPORT_PATH, bm_model)
bm_model_log_path = "{}/{}".format(BENCHMARK_REPORT_PATH, bm_model)
os.makedirs(bm_model_log_path, exist_ok=True)
csv_file = '{}/ab_report.csv'.format(BENCHMARK_TMP_PATH)
csv_file = "{}/ab_report.csv".format(BENCHMARK_TMP_PATH)
if os.path.exists(csv_file):
shutil.move(csv_file, bm_model_log_path)
cmd = 'tar -cvzf {}/benchmark.tar.gz {}'.format(bm_model_log_path, BENCHMARK_TMP_PATH)
cmd = "tar -cvzf {}/benchmark.tar.gz {}".format(
bm_model_log_path, BENCHMARK_TMP_PATH
)
execute(cmd, wait=True)

cmd = 'tar -cvzf {}/logs.tar.gz {}'.format(bm_model_log_path, TS_LOGS_PATH)
cmd = "tar -cvzf {}/logs.tar.gz {}".format(bm_model_log_path, TS_LOGS_PATH)
execute(cmd, wait=True)
print("finish benchmark {}".format(bm_model))

# generate final report
gen_md_report.iterate_subdir(
BENCHMARK_REPORT_PATH,
'{}/report.md'.format(BENCHMARK_REPORT_PATH),
"{}/report.md".format(BENCHMARK_REPORT_PATH),
bm_config["hardware"],
bm_config["version"])
bm_config["version"],
)
print("report.md is generated")

# load logs to remote storage
if "report_cmd" in bm_config:
execute(bm_config["report_cmd"], wait=True)


def clean_up_benchmark_env(bm_config):
shutil.rmtree(BENCHMARK_TMP_PATH, ignore_errors=True)
shutil.rmtree(MODEL_JSON_CONFIG_PATH, ignore_errors=True)
shutil.rmtree(MODEL_STORE, ignore_errors=True)
shutil.rmtree(WF_STORE, ignore_errors=True)


def execute(command, wait=False, stdout=None, stderr=None, shell=True):
print("execute: {}".format(command))
cmd = Popen(command, shell=shell, close_fds=True, stdout=stdout, stderr=stderr, universal_newlines=True)
cmd = Popen(
command,
shell=shell,
close_fds=True,
stdout=stdout,
stderr=stderr,
universal_newlines=True,
)
if wait:
cmd.wait()
return cmd


def get_torchserve_version():
# fetch the torchserve version from version.txt file
with open(os.path.join(CWD, 'ts', 'version.txt'), 'r') as file:
with open(os.path.join(CWD, "ts", "version.txt"), "r") as file:
version = file.readline().rstrip()
return version


def main():
parser = argparse.ArgumentParser()

@@ -250,12 +279,17 @@ def main():
)

arguments = parser.parse_args()
skip_ts_config = False if arguments.skip is not None and arguments.skip.lower() == "false" else True
skip_ts_config = (
False
if arguments.skip is not None and arguments.skip.lower() == "false"
else True
)
bm_config = load_benchmark_config(arguments.input, skip_ts_config)
benchmark_env_setup(bm_config, skip_ts_config)
run_benchmark(bm_config)
clean_up_benchmark_env(bm_config)
print("benchmark_serving.sh finished successfully.")


if __name__ == "__main__":
main()
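
Taken together, `main()` drives the whole benchmark flow: load the YAML config, optionally install TorchServe, run each model config through benchmark-ab.py, and clean up. A hedged sketch of invoking the script from the repository root follows; the `--input`/`--skip` flag names and the config file path are assumptions for illustration, since the argparse definitions are collapsed in the diff above.

```python
# Hypothetical invocation of benchmarks/auto_benchmark.py from the repository root.
# Flag names (--input, --skip) and the YAML path are assumed, not confirmed by this diff.
import subprocess

subprocess.run(
    [
        "python",
        "benchmarks/auto_benchmark.py",
        "--input", "benchmarks/benchmark_config_cpu.yaml",  # hypothetical config file
        "--skip", "false",  # "false" forces a fresh TorchServe install before benchmarking
    ],
    check=True,
)
```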