Skip to content

Commit 6efe211

Browse files
authored
Merge branch 'master' into master
2 parents 7477ba7 + 4ad8986 commit 6efe211

File tree

13 files changed

+664
-22
lines changed

13 files changed

+664
-22
lines changed

ci/buildspec_cpu.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,8 @@ phases:
77
commands:
88
- apt-get update
99
- apt-get install sudo -y
10-
- python ts_scripts/install_dependencies.py --environment=dev
10+
- pip install -r ci/launcher/requirements.txt
1111

1212
build:
1313
commands:
14-
- python torchserve_sanity.py
15-
- cd serving-sdk/ && mvn clean install -q && cd ../
14+
- python ci/launcher/launch_test.py --instance-type c5.18xlarge

ci/buildspec_cpu_backup.yml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Build Spec for AWS CodeBuild CI
2+
3+
version: 0.2
4+
5+
phases:
6+
install:
7+
commands:
8+
- apt-get update
9+
- apt-get install sudo -y
10+
- python ts_scripts/install_dependencies.py --environment=dev
11+
12+
build:
13+
commands:
14+
- python torchserve_sanity.py
15+
- cd serving-sdk/ && mvn clean install -q && cd ../

ci/buildspec_gpu.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,8 @@ phases:
77
commands:
88
- apt-get update
99
- apt-get install sudo -y
10-
- python ts_scripts/install_dependencies.py --cuda=cu102 --environment=dev
10+
- pip install -r ci/launcher/requirements.txt
1111

1212
build:
1313
commands:
14-
- python torchserve_sanity.py
15-
- cd serving-sdk/ && mvn clean install -q && cd ../
14+
- python ci/launcher/launch_test.py --instance-type p3.2xlarge

ci/buildspec_gpu_backup.yml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Build Spec for AWS CodeBuild CI
2+
3+
version: 0.2
4+
5+
phases:
6+
install:
7+
commands:
8+
- apt-get update
9+
- apt-get install sudo -y
10+
- python ts_scripts/install_dependencies.py --cuda=cu102 --environment=dev
11+
12+
build:
13+
commands:
14+
- python torchserve_sanity.py
15+
- cd serving-sdk/ && mvn clean install -q && cd ../

ci/launcher/launch_test.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
import argparse
2+
import boto3
3+
import datetime
4+
import random
5+
import subprocess
6+
import os
7+
import time
8+
9+
10+
from botocore.config import Config
11+
from fabric2 import Connection
12+
from invoke import run
13+
14+
from utils import LOGGER, GPU_INSTANCES
15+
from utils import ec2 as ec2_utils
16+
17+
# Dependency installation is the only step that differs between CPU and GPU hosts;
# the GPU variant appends the CUDA flag to the same base command.
_INSTALL_DEPENDENCIES_COMMAND = "python3 ts_scripts/install_dependencies.py --environment=dev"

# Steps executed after dependency installation on every instance type.
_SHARED_TEST_COMMANDS = [
    "python3 torchserve_sanity.py",
    "cd serving-sdk/ && mvn clean install -q && cd ../",
]

# Commands executed, in order, on a CPU instance.
CPU_INSTANCE_COMMANDS_LIST = [_INSTALL_DEPENDENCIES_COMMAND, *_SHARED_TEST_COMMANDS]

# Commands executed, in order, on a GPU instance.
GPU_INSTANCE_COMMANDS_LIST = [_INSTALL_DEPENDENCIES_COMMAND + " --cuda=cu102", *_SHARED_TEST_COMMANDS]
28+
29+
30+
def run_commands_on_ec2_instance(ec2_connection, is_gpu):
    """
    Execute the sanity command list on a remote ec2 instance inside a fresh virtual environment.

    This function assumes that the required 'serve' folder is already available on the ec2 instance
    in the home directory (/home/ubuntu/serve).

    :param ec2_connection: connection object exposing `cd`, `prefix` and `run`
    :param is_gpu: when truthy the GPU command list is executed, otherwise the CPU command list
    :return: dict mapping every executed command to its return code
    """
    venv_dir = "venv"

    if is_gpu:
        pending_commands = GPU_INSTANCE_COMMANDS_LIST
    else:
        pending_commands = CPU_INSTANCE_COMMANDS_LIST

    return_codes = {}

    with ec2_connection.cd("/home/ubuntu/serve"):
        ec2_connection.run("python3 -m venv " + venv_dir)

        # Every command below runs with the virtual environment activated.
        with ec2_connection.prefix("source " + venv_dir + "/bin/activate"):
            for cmd in pending_commands:
                LOGGER.info(f"*** Executing command on ec2 instance: {cmd}")

                # warn=True: a failing command is recorded instead of raising, so the
                # remaining commands still run and the full map is returned.
                outcome = ec2_connection.run(
                    cmd,
                    echo=True,
                    warn=True,
                    pty=True,
                    shell="/bin/bash",
                    env={"LC_CTYPE": "en_US.utf8", "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64"},
                )

                exit_code = outcome.return_code
                if exit_code != 0:
                    LOGGER.error(f"*** Failed command: {cmd}")
                    LOGGER.error(f"*** Failed command stdout: {outcome.stdout}")
                    LOGGER.error(f"*** Failed command stderr: {outcome.stderr}")

                return_codes[cmd] = exit_code

    return return_codes
64+
65+
66+
def launch_ec2_instance(region, instance_type, ami_id):
    """
    Spin up an ec2 instance, check out the current Github Pull Request on it, and run the sanity test.

    Note: This function relies on CODEBUILD environment variables. If this function is used outside of CODEBUILD,
    modify the function accordingly.

    The repository is cloned on the instance and the head of the pull request is checked out. The return value
    of every executed command is logged. The temporary ssh key pair is always destroyed, and the instance is
    terminated whenever it was successfully launched, regardless of whether the test run succeeded.

    :param region: aws region in which the ec2 instance is launched
    :param instance_type: ec2 instance type; its first two characters are matched against GPU_INSTANCES
                          to choose between the GPU and CPU command lists
    :param ami_id: id of the AMI used to launch the instance
    :raises ValueError: if any command executed on the instance returned a non-zero value
    :raises Exception: any other failure is logged and re-raised unchanged
    """
    github_repo = os.environ.get("CODEBUILD_SOURCE_REPO_URL", "https://github.com/pytorch/serve.git").strip()
    github_hookshot = os.environ.get("CODEBUILD_SOURCE_VERSION", "local-start").strip()
    github_hookshot = github_hookshot.replace("/", "-")
    # NOTE(review): presumably CODEBUILD_SOURCE_VERSION looks like "pr/<number>" - confirm. A value that has
    # neither "/" nor "-" raises IndexError here, and the "local-start" default yields "start".
    github_pull_request_number = github_hookshot.split("-")[1]

    ec2_client = boto3.client("ec2", config=Config(retries={"max_attempts": 10}), region_name=region)
    random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    ec2_key_name = f"{github_hookshot}-ec2-instance-{random.randint(1, 1000)}"

    # Bound before the try block so that the except/finally clauses never hit an unbound name
    # (which would replace the real error with an UnboundLocalError) when a failure happens early.
    instance_id = None
    command_return_value_map = {}

    # Spin up ec2 instance and run tests
    try:
        key_file = ec2_utils.generate_ssh_keypair(ec2_client, ec2_key_name)
        instance_details = ec2_utils.launch_instance(
            ami_id,
            instance_type,
            ec2_key_name=ec2_key_name,
            region=region,
            user_data=None,
            iam_instance_profile_name=ec2_utils.EC2_INSTANCE_ROLE_NAME,
            instance_name=ec2_key_name,
        )

        instance_id = instance_details["InstanceId"]
        ip_address = ec2_utils.get_public_ip(instance_id, region=region)

        LOGGER.info("*** Waiting on instance checks to complete...")
        ec2_utils.check_instance_state(instance_id, state="running", region=region)
        ec2_utils.check_system_state(instance_id, system_status="ok", instance_status="ok", region=region)
        LOGGER.info("*** Instance checks complete. Running commands on instance.")

        # Create a fabric connection to the ec2 instance.
        ec2_connection = ec2_utils.get_ec2_fabric_connection(instance_id, key_file, region)

        LOGGER.info("Running update command. This could take a while.")
        ec2_connection.run("sudo apt update")

        # Update command takes a while to run, and should ideally run uninterrupted
        time.sleep(300)

        with ec2_connection.cd("/home/ubuntu"):
            LOGGER.info(f"*** Cloning the PR related to {github_hookshot} on the ec2 instance.")
            ec2_connection.run(f"git clone {github_repo}")
            ec2_connection.run(
                f"cd serve && git fetch origin pull/{github_pull_request_number}/head:pull && git checkout pull"
            )

            ec2_connection.run("sudo apt-get install -y python3-venv")
            # Following is necessary on Base Ubuntu DLAMI because the default python is python2
            # This will NOT fail for other AMI where default python is python3
            ec2_connection.run(
                "sudo cp /usr/local/bin/pip3 /usr/local/bin/pip && pip install --upgrade pip", warn=True
            )
            ec2_connection.run(
                "sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 1", warn=True
            )

            is_gpu = instance_type[:2] in GPU_INSTANCES

            command_return_value_map = run_commands_on_ec2_instance(ec2_connection, is_gpu)

            # Any non-zero return code is truthy, so this detects at least one failed command.
            if any(command_return_value_map.values()):
                raise ValueError("*** One of the commands executed on ec2 returned a non-zero value.")
            else:
                LOGGER.info("*** All commands executed successfully on ec2. command:return_value map is as follows:")
                LOGGER.info(command_return_value_map)

    except ValueError as e:
        LOGGER.error(f"*** ValueError: {e}")
        LOGGER.error("*** Following commands had the corresponding return value:")
        LOGGER.error(command_return_value_map)
        raise
    except Exception as e:
        LOGGER.error(f"*** Exception occured. {e}")
        raise
    finally:
        try:
            # Nothing to terminate when the failure happened before the instance was launched.
            if instance_id is not None:
                LOGGER.warning(f"*** Terminating instance-id: {instance_id} with name: {ec2_key_name}")
                ec2_utils.terminate_instance(instance_id, region)
        finally:
            # Nested finally: the key pair is removed even if terminating the instance fails.
            LOGGER.warning(f"*** Destroying ssh key_pair: {ec2_key_name}")
            ec2_utils.destroy_ssh_keypair(ec2_client, ec2_key_name)
153+
154+
155+
def main():
    """
    Parse the command line options and run the sanity test on a newly launched ec2 instance.

    Supported options (all optional):
        --instance-type  ec2 instance type to run the test on (default: p3.2xlarge)
        --region         aws region in which the instance is spawned (default: us-west-2)
        --ami-id         AMI used to launch the instance (default: ami-032e40ca6b0973cf2)
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--instance-type",
        default="p3.2xlarge",
        help="Specify the instance type you want to run the test on. Default: p3.2xlarge",
    )

    parser.add_argument(
        "--region",
        default="us-west-2",
        help="Specify the aws region in which you want associated ec2 instance to be spawned",
    )

    parser.add_argument(
        "--ami-id",
        default="ami-032e40ca6b0973cf2",
        # Adjacent string literals are concatenated verbatim, so the separating space must be
        # part of one of the literals (it was missing, rendering "mightneed" in --help).
        help="Specify an Ubuntu Base DLAMI only. This AMI type ships with nvidia drivers already setup. "
        "Using other AMIs might need non-trivial installations on the AMI. AMI-ids differ per aws region.",
    )

    arguments = parser.parse_args()

    launch_ec2_instance(arguments.region, arguments.instance_type, arguments.ami_id)


if __name__ == "__main__":
    main()

ci/launcher/requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
fabric2==2.5.0
2+
boto3
3+
retrying

ci/launcher/utils/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import logging
2+
import sys
3+
4+
# Logger shared by the launcher scripts; INFO and above is written to stderr.
_stderr_handler = logging.StreamHandler(sys.stderr)
LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.INFO)
LOGGER.addHandler(_stderr_handler)

# Default aws region name.
DEFAULT_REGION = "us-west-2"

# Two-character instance-type prefixes that identify GPU instances.
GPU_INSTANCES = [
    "p2",
    "p3",
    "p4",
    "g2",
    "g3",
    "g4",
]

0 commit comments

Comments
 (0)