Skip to content

Commit fd6f1b6

Browse files
committed
- NodeJS/npm is now managed via NVM (#64: Contributor @cfsnate)
- Fixed IAM policies required to install SOCA and added support for cdk bootstrap (#64: Contributor @cfsnate)
- More consistent way to install EPEL repository across distros
- Better way to install SSM on the Scheduler host (similar to what we are already doing with ComputeNodes)
- Updated remote job submission to fix error with group ownership when using a remote input file
1 parent 98b2a42 commit fd6f1b6

File tree

21 files changed

+144
-63
lines changed

21 files changed

+144
-63
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ installer/resources/src/installer_history.txt
1010
installer/resources/src/cdk.context.json
1111
installer/resources/src/cdk.out/
1212
installer/resources/src/envs/
13+
installer/resources/src/.requirements_installed
1314

1415
# C extensions
1516
*.so

README.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,12 @@ Scale-Out Computing on AWS project consists in a collection of CloudFormation te
1313
```bash
1414
.
1515
├── soca
16-
   ├── cluster_analytics [ Scripts to ingest cluster/job data into ELK ]
17-
   ├── cluster_hooks [ Scheduler Hooks ]
18-
   ├── cluster_logs_management [ Scripts to manage cluster log rotation ]
19-
   ├── cluster_manager [ Scripts to control Soca cluster ]
20-
   ├── cluster_web_ui [ Web Interface ]
21-
   └── cluster_node_bootstrap [ Script to configure compute nodes]
16+
├── cluster_analytics [ Scripts to ingest cluster/job data into ELK ]
17+
├── cluster_hooks [ Scheduler Hooks ]
18+
├── cluster_logs_management [ Scripts to manage cluster log rotation ]
19+
├── cluster_manager [ Scripts to control Soca cluster ]
20+
├── cluster_web_ui [ Web Interface ]
21+
└── cluster_node_bootstrap [ Script to configure compute nodes]
2222
└── scripts
2323
├── config.cfg [ List of all packages to install ]
2424
├── Scheduler.sh [ Configure Scheduler Node ]
@@ -30,7 +30,7 @@ This solution collects anonymous operational metrics to help AWS improve the qua
3030

3131
***
3232

33-
Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
33+
Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3434

3535
Licensed under the Apache License, Version 2.0 (the "License");
3636
you may not use this file except in compliance with the License.

installer/Makefile

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
# Installs the installer's Python dependencies, using a stamp file so the
# (slow) pip step only re-runs when requirements.txt changes.
#
# Set QUIET_MODE=true (environment or command line) to pass --quiet to pip.

# Keep pip_flags defined (empty) unless the caller already supplied a value.
pip_flags ?=
ifeq ($(strip ${QUIET_MODE}),true)
pip_flags := --quiet
endif

# Stamp target: rebuilt whenever requirements.txt is newer than the stamp.
# `touch $@` is the last step, so a failed pip command leaves the stamp
# stale and the install is retried on the next run.
resources/src/.requirements_installed: resources/src/requirements.txt
	@echo "Installing/upgrading required dependencies (this can take a couple of minutes)"
	pip3 install --upgrade pip ${pip_flags}
	pip3 install -r $< ${pip_flags}
	touch $@

installer/SOCAInstallerIamPolicy.json

Lines changed: 39 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -42,34 +42,33 @@
4242
"ec2:CreateNatGateway",
4343
"ec2:CreateNetworkInterface",
4444
"ec2:CreateNetworkInterfacePermission",
45-
"ec2:CreateNetworkInterfacePermission",
4645
"ec2:CreateRoute",
4746
"ec2:CreateRouteTable",
48-
"ec2:CreateRouteTable",
4947
"ec2:CreateSecurityGroup",
5048
"ec2:CreateSubnet",
5149
"ec2:CreateTags",
5250
"ec2:CreateVpc",
5351
"ec2:CreateVpcEndpoint",
52+
"ec2:DescribeAddresses",
5453
"ec2:DescribeAvailabilityZones",
5554
"ec2:DescribeFlowLogs",
5655
"ec2:DescribeInstances",
5756
"ec2:DescribeInternetGateways",
5857
"ec2:DescribeKeyPairs",
5958
"ec2:DescribeNatGateways",
6059
"ec2:DescribeNetworkInterfaces",
60+
"ec2:DescribeNetwork*",
6161
"ec2:DescribeRegions",
6262
"ec2:DescribeRouteTables",
63-
"ec2:DescribeRouteTables",
6463
"ec2:DescribeSecurityGroups",
6564
"ec2:DescribeSubnets",
65+
"ec2:DescribeVpcAttribute",
6666
"ec2:DescribeVpcEndpoints",
6767
"ec2:DescribeVpcs",
6868
"ec2:ModifySubnetAttribute",
6969
"ec2:ModifyVpcAttribute",
7070
"ec2:RevokeSecurityGroupEgress",
7171
"ec2:RunInstances",
72-
"ec2:describeAddresses",
7372
"elasticfilesystem:CreateFileSystem",
7473
"elasticfilesystem:CreateMountTarget",
7574
"elasticfilesystem:DescribeFileSystems",
@@ -90,7 +89,6 @@
9089
"elasticloadbalancing:ModifyRule",
9190
"elasticloadbalancing:RegisterTargets",
9291
"es:AddTags",
93-
"es:AddTags",
9492
"es:CreateElasticsearchDomain",
9593
"es:DescribeElasticsearchDomain",
9694
"fsx:CreateFileSystem",
@@ -123,7 +121,6 @@
123121
"route53resolver:CreateResolverEndpoint",
124122
"route53resolver:CreateResolverRule",
125123
"route53resolver:GetResolverEndpoint",
126-
"route53resolver:GetResolverEndpoint",
127124
"route53resolver:GetResolverRule",
128125
"route53resolver:GetResolverRuleAssociation",
129126
"route53resolver:PutResolverRulePolicy",
@@ -151,12 +148,10 @@
151148
"ds:DeleteDirectory",
152149
"ec2:DeleteFlowLogs",
153150
"ec2:DeleteInternetGateway",
154-
"ec2:DeleteInternetGateway",
155151
"ec2:DeleteNatGateway",
156152
"ec2:DeleteNetworkInterface",
157153
"ec2:DeleteRoute",
158154
"ec2:DeleteRouteTable",
159-
"ec2:DeleteRouteTable",
160155
"ec2:DeleteSecurityGroup",
161156
"ec2:DeleteSubnet",
162157
"ec2:DeleteVpc",
@@ -168,6 +163,7 @@
168163
"ec2:ReleaseAddress",
169164
"ec2:RevokeSecurityGroupIngress",
170165
"ec2:TerminateInstances",
166+
"ecr:DeleteRepository",
171167
"elasticfilesystem:DeleteFileSystem",
172168
"elasticfilesystem:DeleteMountTarget",
173169
"elasticloadbalancing:DeRegisterTargets",
@@ -179,18 +175,50 @@
179175
"iam:DeleteInstanceProfile",
180176
"iam:DeleteRole",
181177
"iam:DeleteRolePolicy",
178+
"iam:DeleteServiceLinkedRole",
182179
"iam:DetachRolePolicy",
183180
"iam:RemoveRoleFromInstanceProfile",
184181
"lambda:DeleteFunction",
185182
"lambda:RemovePermission",
186183
"logs:DeleteLogGroup",
187184
"route53resolver:DeleteResolverEndpoint",
188-
"route53resolver:DeleteResolverEndpoint",
189185
"route53resolver:DeleteResolverRule",
190186
"route53resolver:DisassociateResolverRule",
191-
"secretsmanager:DeleteSecret"
187+
"secretsmanager:DeleteSecret",
188+
"s3:DeleteBucketPolicy"
189+
],
190+
"Resource": "*"
191+
},
192+
{
193+
"Sid": "CreateCDKToolkitIfNeeded",
194+
"Effect": "Allow",
195+
"Action": [
196+
"s3:PutEncryptionConfiguration",
197+
"s3:PutBucketVersioning",
198+
"s3:PutBucketPublicAccessBlock",
199+
"s3:PutBucketPolicy",
200+
"s3:CreateBucket"
201+
],
202+
"Resource": "arn:aws:s3:::cdk*"
203+
},
204+
{
205+
"Sid": "CreateCDKToolkitIfNeeded2",
206+
"Effect": "Allow",
207+
"Action": [
208+
"ecr:CreateRepository",
209+
"ecr:DescribeRepositories",
210+
"ec2:DescribeAccountAttributes",
211+
"es:CreateServiceLinkedRole",
212+
"ssm:GetParameter",
213+
"ssm:GetParameters",
214+
"ssm:PutParameter",
215+
"ssm:DeleteParameter",
216+
"iam:CreateRole",
217+
"iam:CreateServiceLinkedRole",
218+
"iam:GetRole",
219+
"iam:PutRolePolicy"
192220
],
193221
"Resource": "*"
194222
}
195223
]
196-
}
224+
}

installer/default_config.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
Config:
2-
version: "v2.7.0" # version automatically populated as part of RELEASE-PIPELINE.sh (see github_release.py)
2+
version: "2.7.0" # version automatically populated as part of RELEASE-PIPELINE.sh (see github_release.py)
33
termination_protection: True # Enable (recommended) or Disable Cloudformation Stack termination protection
44
entry_points_subnets: "Public" # Public (recommended) or Private. In Public mode the scheduler and ELB are deployed on PublicSubnets and assigned public IPs. In Private mode the scheduler and ELB are deployed in private subnets. In both cases compute nodes and ElasticSearch/EFS/FSxL are deployed in private subnets. Public does not mean your cluster will be accessible to everyone by default; access to your cluster is still protected by security groups
55

installer/resources/src/cdk_construct.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,7 @@ def iam_roles(self):
390390
self.soca_resources["scheduler_role"] = iam.Role(self, "SchedulerRole", description="IAM role assigned to the scheduler host", assumed_by=iam.CompositePrincipal(iam.ServicePrincipal(principals_suffix["ssm"]), iam.ServicePrincipal(principals_suffix["ec2"])))
391391
self.soca_resources["compute_node_role"] = iam.Role(self, "ComputeNodeRole", description="IAM role assigned to the compute nodes", assumed_by=iam.CompositePrincipal(iam.ServicePrincipal(principals_suffix["ssm"]), iam.ServicePrincipal(principals_suffix["ec2"])))
392392
self.soca_resources["spot_fleet_role"] = iam.Role(self, "SpotFleetRole", description="IAM role to manage SpotFleet requests", assumed_by=iam.ServicePrincipal(principals_suffix["spotfleet"]))
393+
self.soca_resources["spot_fleet_role"].add_managed_policy(iam.ManagedPolicy.from_aws_managed_policy_name("service-role/AmazonEC2SpotFleetTaggingRole"))
393394
self.soca_resources["compute_node_instance_profile"] = iam.CfnInstanceProfile(self, "ComputeNodeInstanceProfile", roles=[self.soca_resources["compute_node_role"].role_name])
394395
else:
395396
# Reference existing Scheduler/ComputeNode/SpotFleet roles
@@ -1077,4 +1078,4 @@ def viewer(self):
10771078
install = SOCAInstall(app, user_specified_variables.cluster_id, env=cdk_env,
10781079
description=f"SOCA cluster version {install_props.Config.version}",
10791080
termination_protection=install_props.Config.termination_protection)
1080-
app.synth()
1081+
app.synth()

installer/resources/src/install_soca.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -495,7 +495,7 @@ def validate_soca_config(user_specified_inputs, install_properties):
495495
os.chdir(install_directory)
496496

497497
# Append Solution ID to Boto3 Construct
498-
aws_solution_user_agent = {"user_agent_extra": "AwsSolution/SO0072/v2.7.0"}
498+
aws_solution_user_agent = {"user_agent_extra": "AwsSolution/SO0072/2.7.0"}
499499
boto_extra_config = config.Config(**aws_solution_user_agent)
500500

501501
print(f"""

installer/soca_installer.sh

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -77,25 +77,19 @@ if [[ -n $VIRTUAL_ENV ]]; then
7777
else
7878
# Check if Python Virtual environment exist
7979
# If not, create the venv and install required python libraries
80-
if [[ ! -d $PYTHON_VENV ]]; then
80+
if [[ ! -e $PYTHON_VENV/bin/activate ]]; then
8181
echo "No Python virtual environment found. Creating one ..."
82+
rm -rf $PYTHON_VENV
8283
$PYTHON -m venv $PYTHON_VENV
8384
# shellcheck disable=SC1090
8485
. "$PYTHON_VENV/bin/activate"
85-
echo "Installing/upgrading required dependencies (this can take a couple of minutes)"
86-
if [[ $QUIET_MODE = "true" ]]; then
87-
pip3 install --upgrade pip --quiet
88-
pip3 install -r resources/src/requirements.txt --quiet
89-
else
90-
pip3 install --upgrade pip
91-
pip3 install -r resources/src/requirements.txt
92-
fi
9386
else
9487
# Load Python environment
9588
echo "Loading Python Virtual Environment"
9689
source "$PYTHON_VENV/bin/activate"
9790
fi
9891
fi
92+
make resources/src/.requirements_installed
9993

10094
# Install local NodeJS environment and CDK
10195
if [[ ! -d $NVM_DIR ]]; then

source/scripts/Scheduler.sh

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,16 @@ if [[ $# -lt 2 ]]; then
2323
fi
2424

2525
# Install SSM
26-
yum install -y https://s3.amazonaws.com/ec2-downloads-windows/SSMAgent/latest/linux_amd64/amazon-ssm-agent.rpm
27-
systemctl enable amazon-ssm-agent
28-
systemctl restart amazon-ssm-agent
26+
machine=$(uname -m)
27+
if ! systemctl status amazon-ssm-agent; then
28+
if [[ $machine == "x86_64" ]]; then
29+
yum install -y $SSM_X86_64_URL
30+
elif [[ $machine == "aarch64" ]]; then
31+
yum install -y $SSM_AARCH64_URL
32+
fi
33+
systemctl enable amazon-ssm-agent || true
34+
systemctl restart amazon-ssm-agent
35+
fi
2936

3037
mkdir -p /apps/soca/$SOCA_CONFIGURATION
3138
FS_DATA_PROVIDER=$1
@@ -37,22 +44,26 @@ SERVER_HOSTNAME=$(hostname)
3744
SERVER_HOSTNAME_ALT=$(echo $SERVER_HOSTNAME | cut -d. -f1)
3845
echo $SERVER_IP $SERVER_HOSTNAME $SERVER_HOSTNAME_ALT >> /etc/hosts
3946

40-
# Install Epel repo
41-
if [[ $SOCA_BASE_OS == "amazonlinux2" ]]; then
42-
sudo amazon-linux-extras install -y epel
47+
# Install System required libraries / EPEL
48+
if [[ $SOCA_BASE_OS == "rhel7" ]]; then
49+
# RHEL7
50+
curl "$EPEL_URL" -o $EPEL_RPM
51+
if [[ $(md5sum "$EPEL_RPM" | awk '{print $1}') != "$EPEL_HASH" ]]; then
52+
echo -e "FATAL ERROR: Checksum for EPEL failed. File may be compromised." > /etc/motd
53+
exit 1
54+
fi
55+
yum -y install $EPEL_RPM
56+
yum install -y $(echo ${SYSTEM_PKGS[*]} ${SCHEDULER_PKGS[*]}) --enablerepo rhui-REGION-rhel-server-optional
4357
elif [[ $SOCA_BASE_OS == "centos7" ]]; then
58+
# CentOS
4459
yum -y install epel-release
60+
yum install -y $(echo ${SYSTEM_PKGS[*]} ${SCHEDULER_PKGS[*]})
4561
else
46-
# rhel7
47-
yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
48-
fi
49-
50-
if [[ $SOCA_BASE_OS == "rhel7" ]]; then
51-
yum install -y $(echo ${SYSTEM_PKGS[*]} ${SCHEDULER_PKGS[*]}) --enablerepo rhui-REGION-rhel-server-optional
52-
else
53-
yum install -y $(echo ${SYSTEM_PKGS[*]} ${SCHEDULER_PKGS[*]})
62+
# AL2
63+
amazon-linux-extras install -y epel
64+
yum update --security
65+
yum install -y $(echo ${SYSTEM_PKGS[*]} ${SCHEDULER_PKGS[*]})
5466
fi
55-
5667
yum install -y $(echo ${OPENLDAP_SERVER_PKGS[*]} ${SSSD_PKGS[*]})
5768

5869
# Mount File system
@@ -63,7 +74,7 @@ if [[ "$FS_DATA_PROVIDER" == "fsx_lustre" ]] || [[ "$FS_APPS_PROVIDER" == "fsx_l
6374
if [[ -z "$(rpm -qa lustre-client)" ]]; then
6475
# Install FSx for Lustre Client
6576
if [[ "$SOCA_BASE_OS" == "amazonlinux2" ]]; then
66-
sudo amazon-linux-extras install -y lustre2.10
77+
amazon-linux-extras install -y lustre2.10
6778
else
6879
kernel=$(uname -r)
6980
machine=$(uname -m)

source/scripts/SchedulerPostReboot.sh

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -302,11 +302,30 @@ fi
302302
IFS="-" read name sanitized_cluster_name <<< "echo $SOCA_CONFIGURATION"
303303
sed -i "s/__SOCA_CLUSTER__NAME__/$sanitized_cluster_name/g" /apps/soca/$SOCA_CONFIGURATION/cluster_web_ui/templates/common/horizontal_menu_bar.html
304304

305-
# Start Web UI
305+
# Install NodeJS/NPM if needed
306+
if [[ ! $(command -v npm) ]];
307+
then
308+
echo "npm not detected, installing it ... "
309+
export NVM_DIR="/root/nvm/$(date +%s)/.nvm"
310+
mkdir -p $NVM_DIR
311+
echo "Downloading $NVM_URL"
312+
wget "$NVM_URL"
313+
if [[ $(md5sum $NVM_INSTALL_SCRIPT | awk '{print $1}') != $NVM_HASH ]]; then
314+
echo -e "FATAL ERROR: Checksum for NVM failed. File may be compromised." > /etc/motd
315+
exit 1
316+
fi
317+
chmod +x $NVM_INSTALL_SCRIPT
318+
/bin/bash $NVM_INSTALL_SCRIPT
319+
source "$NVM_DIR/nvm.sh" # This loads nvm
320+
# shellcheck disable=SC1090
321+
source "$NVM_DIR/bash_completion"
322+
nvm install node
323+
fi
306324

307-
# Install required node module
325+
# Install required Node module
308326
npm install --prefix /apps/soca/"$SOCA_CONFIGURATION"/cluster_web_ui/static monaco-editor@0.24.0
309327

328+
# Start Web UI
310329
chmod +x /apps/soca/"$SOCA_CONFIGURATION"/cluster_web_ui/socawebui.sh
311330
/apps/soca/"$SOCA_CONFIGURATION"/cluster_web_ui/socawebui.sh start
312331

0 commit comments

Comments
 (0)