Skip to content

Commit cd6b9cf

Browse files
authored
Support govcloud (#2118)
1 parent 2b76a1e commit cd6b9cf

File tree

20 files changed

+643
-73
lines changed

20 files changed

+643
-73
lines changed

build/generate_ami_mapping.go

Lines changed: 37 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -169,10 +169,10 @@ func SupportedRegions() []string {
169169
RegionMESouth1,
170170
RegionSAEast1,
171171
RegionAFSouth1,
172+
RegionUSGovWest1,
173+
RegionUSGovEast1,
172174
// RegionCNNorthwest1,
173175
// RegionCNNorth1,
174-
// RegionUSGovWest1,
175-
// RegionUSGovEast1,
176176
}
177177
}
178178

@@ -200,10 +200,38 @@ func EKSResourceAccountID(region string) string {
200200
}
201201

202202
func main() {
203-
destFile := mustExtractArg()
203+
if len(os.Args) > 3 {
204+
fmt.Println("usage: go run generate_ami_mapping.go <abs_dest_path> public|govcloud")
205+
os.Exit(1)
206+
}
207+
208+
destFile := os.Args[1]
209+
cloudType := os.Args[2]
210+
211+
if cloudType != "public" && cloudType != "govcloud" {
212+
log.Fatalf("%s is not a valid value; specify public or govcloud", cloudType)
213+
}
214+
215+
k8sVersionMap := map[string]map[string]map[string]string{}
216+
217+
if _, err := os.Stat(destFile); !os.IsNotExist(err) {
218+
jsonBytes, err := ioutil.ReadFile(destFile)
219+
if err != nil {
220+
log.Fatal(err.Error())
221+
}
222+
json.Unmarshal(jsonBytes, &k8sVersionMap)
223+
}
224+
204225
k8sVersion := "1.18"
205-
regions := map[string]map[string]string{}
226+
227+
if k8sVersionMap[k8sVersion] == nil {
228+
k8sVersionMap[k8sVersion] = map[string]map[string]string{}
229+
}
206230
for _, region := range SupportedRegions() {
231+
if (cloudType == "govcloud") != (region == RegionUSGovEast1 || region == RegionUSGovWest1) {
232+
// cloudType == "govcloud" xor (region is us govclouds)
233+
continue
234+
}
207235
fmt.Print(region)
208236
sess := session.New(&aws.Config{Region: aws.String(region)})
209237
svc := ec2.New(sess)
@@ -215,15 +243,17 @@ func main() {
215243
if err != nil {
216244
log.Fatal(err.Error())
217245
}
218-
regions[region] = map[string]string{
246+
247+
if k8sVersionMap[k8sVersion][region] == nil {
248+
k8sVersionMap[k8sVersion][region] = map[string]string{}
249+
}
250+
k8sVersionMap[k8sVersion][region] = map[string]string{
219251
"cpu": cpuAMI,
220252
"accelerated": acceleratedAMI,
221253
}
222254
fmt.Println(" ✓")
223255
}
224256

225-
k8sVersionMap := map[string]interface{}{}
226-
k8sVersionMap[k8sVersion] = regions
227257
marshalledBytes, err := json.MarshalIndent(k8sVersionMap, "", "\t")
228258
if err != nil {
229259
log.Fatal(err.Error())
@@ -288,12 +318,3 @@ func FindImage(ec2api ec2iface.EC2API, ownerAccount, namePattern string) (string
288318

289319
return *output.Images[0].ImageId, nil
290320
}
291-
292-
func mustExtractArg() string {
293-
if len(os.Args) != 2 {
294-
fmt.Println("usage: go run generate_ami_mapping.go <abs_dest_path>")
295-
os.Exit(1)
296-
}
297-
298-
return os.Args[1]
299-
}

dev/create_user.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,18 @@
3737

3838
user_name = f"dev-{cluster_name}-{cortex_region}"
3939

40-
iam_client = boto3.client("iam")
40+
iam_client = boto3.client("iam", region_name=cortex_region)
4141

4242
try:
4343
iam_client.get_user(UserName=user_name)
4444
except iam_client.exceptions.NoSuchEntityException:
4545
iam_client.create_user(UserName=user_name)
4646

47-
policy_arn = f"arn:aws:iam::{account_id}:policy/{user_name}"
47+
partition = "aws"
48+
if "us-gov" in cortex_region:
49+
partition = "aws-us-gov"
50+
51+
policy_arn = f"arn:{partition}:iam::{account_id}:policy/{user_name}"
4852

4953
try:
5054
iam_client.get_policy(PolicyArn=policy_arn)

dev/minimum_aws_policy.json

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,10 @@
4343
"logs:CreateLogGroup"
4444
],
4545
"Resource": [
46-
"arn:aws:ssm:*:$CORTEX_ACCOUNT_ID:parameter/aws/*",
47-
"arn:aws:ssm:*::parameter/aws/*",
48-
"arn:aws:logs:$CORTEX_REGION:$CORTEX_ACCOUNT_ID:log-group:$CORTEX_CLUSTER_NAME",
49-
"arn:aws:iam::$CORTEX_ACCOUNT_ID:role/*"
46+
"arn:*:ssm:*:$CORTEX_ACCOUNT_ID:parameter/aws/*",
47+
"arn:*:ssm:*::parameter/aws/*",
48+
"arn:*:logs:$CORTEX_REGION:$CORTEX_ACCOUNT_ID:log-group:$CORTEX_CLUSTER_NAME",
49+
"arn:*:iam::$CORTEX_ACCOUNT_ID:role/*"
5050
]
5151
},
5252
{
@@ -81,12 +81,12 @@
8181
"iam:GetRolePolicy"
8282
],
8383
"Resource": [
84-
"arn:aws:iam::$CORTEX_ACCOUNT_ID:instance-profile/eksctl-*",
85-
"arn:aws:iam::$CORTEX_ACCOUNT_ID:role/eksctl-*",
86-
"arn:aws:iam::$CORTEX_ACCOUNT_ID:role/aws-service-role/eks-nodegroup.amazonaws.com/AWSServiceRoleForAmazonEKSNodegroup",
87-
"arn:aws:iam::$CORTEX_ACCOUNT_ID:role/eksctl-managed-*",
88-
"arn:aws:iam::$CORTEX_ACCOUNT_ID:oidc-provider/*",
89-
"arn:aws:logs:$CORTEX_REGION:$CORTEX_ACCOUNT_ID:log-group:$CORTEX_CLUSTER_NAME:*"
84+
"arn:*:iam::$CORTEX_ACCOUNT_ID:instance-profile/eksctl-*",
85+
"arn:*:iam::$CORTEX_ACCOUNT_ID:role/eksctl-*",
86+
"arn:*:iam::$CORTEX_ACCOUNT_ID:role/aws-service-role/eks-nodegroup.amazonaws.com/AWSServiceRoleForAmazonEKSNodegroup",
87+
"arn:*:iam::$CORTEX_ACCOUNT_ID:role/eksctl-managed-*",
88+
"arn:*:iam::$CORTEX_ACCOUNT_ID:oidc-provider/*",
89+
"arn:*:logs:$CORTEX_REGION:$CORTEX_ACCOUNT_ID:log-group:$CORTEX_CLUSTER_NAME:*"
9090
]
9191
},
9292
{
@@ -99,7 +99,7 @@
9999
"iam:CreatePolicyVersion",
100100
"iam:DeletePolicyVersion"
101101
],
102-
"Resource": "arn:aws:iam::$CORTEX_ACCOUNT_ID:policy/cortex-*"
102+
"Resource": "arn:*:iam::$CORTEX_ACCOUNT_ID:policy/cortex-*"
103103
},
104104
{
105105
"Effect": "Allow",
@@ -123,17 +123,17 @@
123123
{
124124
"Effect": "Allow",
125125
"Action": "sqs:*",
126-
"Resource": "arn:aws:sqs:$CORTEX_REGION:$CORTEX_ACCOUNT_ID:cx-*"
126+
"Resource": "arn:*:sqs:$CORTEX_REGION:$CORTEX_ACCOUNT_ID:cx-*"
127127
},
128128
{
129129
"Effect": "Allow",
130130
"Action": "s3:*",
131-
"Resource": "arn:aws:s3:::$CORTEX_CLUSTER_NAME*"
131+
"Resource": "arn:*:s3:::$CORTEX_CLUSTER_NAME*"
132132
},
133133
{
134134
"Effect": "Allow",
135135
"Action": "s3:*",
136-
"Resource": "arn:aws:s3:::$CORTEX_CLUSTER_NAME*/*"
136+
"Resource": "arn:*:s3:::$CORTEX_CLUSTER_NAME*/*"
137137
}
138138
]
139139
}

dev/registry.sh

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,19 @@ function registry_login() {
7777
if [ "$is_registry_logged_in" = "false" ]; then
7878
blue_echo "Logging in to ECR..."
7979
aws ecr get-login-password --region $AWS_REGION | docker login --username AWS --password-stdin $registry_push_url
80-
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 790709498068.dkr.ecr.us-west-2.amazonaws.com # this is for the inferentia device plugin image
80+
81+
blue_echo "Logging in to 790709498068.dkr.ecr.us-west-2.amazonaws.com for inferentia..."
82+
set +e
83+
echo "$AWS_REGION" | grep "us-gov"
84+
is_gov_cloud=$?
85+
set -e
86+
if [ "$is_gov_cloud" == "0" ]; then
87+
# set NORMAL_REGION_AWS_ACCESS_KEY_ID and NORMAL_REGION_AWS_SECRET_ACCESS_KEY credentials from a regular AWS account (non govcloud) in your dev/config/env.sh
88+
AWS_ACCESS_KEY_ID=$NORMAL_REGION_AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY=$NORMAL_REGION_AWS_SECRET_ACCESS_KEY aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 790709498068.dkr.ecr.us-west-2.amazonaws.com
89+
else
90+
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 790709498068.dkr.ecr.us-west-2.amazonaws.com
91+
fi
92+
8193
is_registry_logged_in="true"
8294
green_echo "Success\n"
8395
fi

docs/clusters/management/auth.md

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -108,10 +108,10 @@ Replace the following placeholders with their respective values in the policy te
108108
"logs:CreateLogGroup"
109109
],
110110
"Resource": [
111-
"arn:aws:ssm:*:$CORTEX_ACCOUNT_ID:parameter/aws/*",
112-
"arn:aws:ssm:*::parameter/aws/*",
113-
"arn:aws:logs:$CORTEX_REGION:$CORTEX_ACCOUNT_ID:log-group:$CORTEX_CLUSTER_NAME",
114-
"arn:aws:iam::$CORTEX_ACCOUNT_ID:role/*"
111+
"arn:*:ssm:*:$CORTEX_ACCOUNT_ID:parameter/aws/*",
112+
"arn:*:ssm:*::parameter/aws/*",
113+
"arn:*:logs:$CORTEX_REGION:$CORTEX_ACCOUNT_ID:log-group:$CORTEX_CLUSTER_NAME",
114+
"arn:*:iam::$CORTEX_ACCOUNT_ID:role/*"
115115
]
116116
},
117117
{
@@ -146,12 +146,12 @@ Replace the following placeholders with their respective values in the policy te
146146
"iam:GetRolePolicy"
147147
],
148148
"Resource": [
149-
"arn:aws:iam::$CORTEX_ACCOUNT_ID:instance-profile/eksctl-*",
150-
"arn:aws:iam::$CORTEX_ACCOUNT_ID:role/eksctl-*",
151-
"arn:aws:iam::$CORTEX_ACCOUNT_ID:role/aws-service-role/eks-nodegroup.amazonaws.com/AWSServiceRoleForAmazonEKSNodegroup",
152-
"arn:aws:iam::$CORTEX_ACCOUNT_ID:role/eksctl-managed-*",
153-
"arn:aws:iam::$CORTEX_ACCOUNT_ID:oidc-provider/*",
154-
"arn:aws:logs:$CORTEX_REGION:$CORTEX_ACCOUNT_ID:log-group:$CORTEX_CLUSTER_NAME:*"
149+
"arn:*:iam::$CORTEX_ACCOUNT_ID:instance-profile/eksctl-*",
150+
"arn:*:iam::$CORTEX_ACCOUNT_ID:role/eksctl-*",
151+
"arn:*:iam::$CORTEX_ACCOUNT_ID:role/aws-service-role/eks-nodegroup.amazonaws.com/AWSServiceRoleForAmazonEKSNodegroup",
152+
"arn:*:iam::$CORTEX_ACCOUNT_ID:role/eksctl-managed-*",
153+
"arn:*:iam::$CORTEX_ACCOUNT_ID:oidc-provider/*",
154+
"arn:*:logs:$CORTEX_REGION:$CORTEX_ACCOUNT_ID:log-group:$CORTEX_CLUSTER_NAME:*"
155155
]
156156
},
157157
{
@@ -164,7 +164,7 @@ Replace the following placeholders with their respective values in the policy te
164164
"iam:CreatePolicyVersion",
165165
"iam:DeletePolicyVersion"
166166
],
167-
"Resource": "arn:aws:iam::$CORTEX_ACCOUNT_ID:policy/cortex-*"
167+
"Resource": "arn:*:iam::$CORTEX_ACCOUNT_ID:policy/cortex-*"
168168
},
169169
{
170170
"Effect": "Allow",
@@ -188,17 +188,17 @@ Replace the following placeholders with their respective values in the policy te
188188
{
189189
"Effect": "Allow",
190190
"Action": "sqs:*",
191-
"Resource": "arn:aws:sqs:$CORTEX_REGION:$CORTEX_ACCOUNT_ID:cx-*"
191+
"Resource": "arn:*:sqs:$CORTEX_REGION:$CORTEX_ACCOUNT_ID:cx-*"
192192
},
193193
{
194194
"Effect": "Allow",
195195
"Action": "s3:*",
196-
"Resource": "arn:aws:s3:::$CORTEX_CLUSTER_NAME*"
196+
"Resource": "arn:*:s3:::$CORTEX_CLUSTER_NAME*"
197197
},
198198
{
199199
"Effect": "Allow",
200200
"Action": "s3:*",
201-
"Resource": "arn:aws:s3:::$CORTEX_CLUSTER_NAME*/*"
201+
"Resource": "arn:*:s3:::$CORTEX_CLUSTER_NAME*/*"
202202
}
203203
]
204204
}

manager/generate_eks.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,17 @@
2222
# kubelet config schema:
2323
# https://github.com/kubernetes/kubernetes/blob/master/staging/src/k8s.io/kubelet/config/v1beta1/types.go
2424
def default_nodegroup(cluster_config):
25+
partition = "aws"
26+
if "us-gov" in cluster_config["region"]:
27+
partition = "aws-us-gov"
2528
return {
2629
"iam": {
2730
"withAddonPolicies": {"autoScaler": True},
2831
"attachPolicyARNs": [
29-
"arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy",
30-
"arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy",
31-
"arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly",
32-
"arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess",
32+
f"arn:{partition}:iam::aws:policy/AmazonEKSWorkerNodePolicy",
33+
f"arn:{partition}:iam::aws:policy/AmazonEKS_CNI_Policy",
34+
f"arn:{partition}:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly",
35+
f"arn:{partition}:iam::aws:policy/ElasticLoadBalancingFullAccess",
3336
cluster_config["cortex_policy_arn"],
3437
]
3538
+ cluster_config.get("iam_policy_arns", []),

manager/install.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ function setup_configmap() {
208208
kubectl -n=default create configmap 'env-vars' \
209209
--from-literal='CORTEX_VERSION'=$CORTEX_VERSION \
210210
--from-literal='CORTEX_REGION'=$CORTEX_REGION \
211-
--from-literal='AWS_REGION'=$CORTEX_REGION \
211+
--from-literal='AWS_DEFAULT_REGION'=$CORTEX_REGION \
212212
--from-literal='CORTEX_TELEMETRY_DISABLE'=$CORTEX_TELEMETRY_DISABLE \
213213
--from-literal='CORTEX_TELEMETRY_SENTRY_DSN'=$CORTEX_TELEMETRY_SENTRY_DSN \
214214
--from-literal='CORTEX_TELEMETRY_SEGMENT_WRITE_KEY'=$CORTEX_TELEMETRY_SEGMENT_WRITE_KEY \

manager/manifests/ami.json

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@
4141
"cpu": "ami-0f85d2eeb0bea62a7"
4242
},
4343
"eu-north-1": {
44-
"accelerated": "ami-016de826d1d553d1b",
45-
"cpu": "ami-05dc6dcd932a8159e"
44+
"accelerated": "ami-05bc92c8d24c7a661",
45+
"cpu": "ami-09f6f77efde8920b8"
4646
},
4747
"eu-south-1": {
4848
"accelerated": "ami-0ce76bb81c438e3b9",
@@ -76,6 +76,14 @@
7676
"accelerated": "ami-0da728680ca5ee572",
7777
"cpu": "ami-0576aabae1709e005"
7878
},
79+
"us-gov-east-1": {
80+
"accelerated": "ami-0d0129213a7f16d12",
81+
"cpu": "ami-090c13eee66020d9b"
82+
},
83+
"us-gov-west-1": {
84+
"accelerated": "ami-06439eec0732c6baa",
85+
"cpu": "ami-04bd686c969f13917"
86+
},
7987
"us-west-1": {
8088
"accelerated": "ami-085fdad868b9007dd",
8189
"cpu": "ami-09bec0a8c8d4925a6"

pkg/cortex/serve/cortex_internal/serve/serve.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,6 @@ def start_fn():
262262
project_dir = os.environ["CORTEX_PROJECT_DIR"]
263263
spec_path = os.environ["CORTEX_API_SPEC"]
264264
model_dir = os.getenv("CORTEX_MODEL_DIR")
265-
266265
host_ip = os.environ["HOST_IP"]
267266
tf_serving_port = os.getenv("CORTEX_TF_BASE_SERVING_PORT", "9000")
268267
tf_serving_host = os.getenv("CORTEX_TF_SERVING_HOST", "localhost")

pkg/cortex/serve/init/script.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def main():
121121
# get API spec
122122
spec_path = os.environ["CORTEX_API_SPEC"]
123123
cache_dir = os.getenv("CORTEX_CACHE_DIR")
124-
region = os.getenv("AWS_REGION") # when it's deployed to AWS
124+
region = os.getenv("AWS_DEFAULT_REGION") # when it's deployed to AWS
125125

126126
with open(spec_path) as json_file:
127127
api_spec = json.load(json_file)

pkg/cortex/serve/start/async_api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ def main():
104104
workload_path = os.environ["CORTEX_ASYNC_WORKLOAD_PATH"]
105105
project_dir = os.environ["CORTEX_PROJECT_DIR"]
106106
readiness_file = os.getenv("CORTEX_READINESS_FILE", "/mnt/workspace/api_readiness.txt")
107-
region = os.getenv("AWS_REGION")
107+
region = os.getenv("AWS_DEFAULT_REGION")
108108
queue_url = os.environ["CORTEX_QUEUE_URL"]
109109
statsd_host = os.getenv("HOST_IP")
110110
statsd_port = os.getenv("CORTEX_STATSD_PORT", "9125")

pkg/cortex/serve/start/batch.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ def start():
183183
tf_serving_port = os.getenv("CORTEX_TF_BASE_SERVING_PORT", "9000")
184184
tf_serving_host = os.getenv("CORTEX_TF_SERVING_HOST", "localhost")
185185

186-
region = os.getenv("AWS_REGION")
186+
region = os.getenv("AWS_DEFAULT_REGION")
187187

188188
has_multiple_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS")
189189
if has_multiple_servers:

pkg/cortex/serve/start/server_grpc.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def init():
147147

148148
model_dir = os.getenv("CORTEX_MODEL_DIR")
149149
cache_dir = os.getenv("CORTEX_CACHE_DIR")
150-
region = os.getenv("AWS_REGION")
150+
region = os.getenv("AWS_DEFAULT_REGION")
151151

152152
host_ip = os.environ["HOST_IP"]
153153
tf_serving_port = os.getenv("CORTEX_TF_BASE_SERVING_PORT", "9000")

pkg/cortex/serve/start/task.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828

2929
def start():
3030
project_dir = os.environ["CORTEX_PROJECT_DIR"]
31-
3231
api_spec_path = os.environ["CORTEX_API_SPEC"]
3332
task_spec_path = os.environ["CORTEX_TASK_SPEC"]
3433

pkg/lib/aws/errors.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ func ErrorDashboardHeightOutOfRange(height int) error {
190190
func ErrorRegionNotConfigured() error {
191191
return errors.WithStack(&errors.Error{
192192
Kind: ErrRegionNotConfigured,
193-
Message: "aws region has not been configured; please set a default region (e.g. `export AWS_REGION=us-west-2`)",
193+
Message: "aws region has not been configured; please set a default region (e.g. `export AWS_DEFAULT_REGION=us-west-2`)",
194194
})
195195
}
196196

pkg/lib/aws/gen_resource_metadata.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818

1919
# https://docs.aws.amazon.com/general/latest/gr/eks.html
2020
# China regions don't seem to support these endpoints (yet?)
21-
# GovCloud is skipped
2221
REGIONS = [
2322
"us-east-2", # Ohio
2423
"us-east-1", # N. Virginia
@@ -41,6 +40,8 @@
4140
"eu-north-1", # Stockholm
4241
"me-south-1", # Bahrain
4342
"sa-east-1", # Sao Paulo
43+
"us-gov-east-1", # GovCloud US-East
44+
"us-gov-west-1", # GovCloud US-West
4445
]
4546

4647
OUTPUT_FILE_NAME = "resource_metadata.go"

0 commit comments

Comments
 (0)