Skip to content

Commit

Permalink
Merge pull request #2343 from consideRatio/pr/2i2c-aws-us-node-sharing
Browse files Browse the repository at this point in the history
2i2c-aws-us: k8s 1.25, highmem nodes, node sharing profile list, ssh-keys
  • Loading branch information
consideRatio authored Mar 13, 2023
2 parents 491b3ca + 717203e commit fec9a13
Show file tree
Hide file tree
Showing 5 changed files with 225 additions and 71 deletions.
197 changes: 197 additions & 0 deletions config/clusters/2i2c-aws-us/common.values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,200 @@ basehub:
scheduling:
userScheduler:
enabled: true
singleuser:
profileList:
# NOTE: About node sharing
#
# CPU/Memory requests/limits are actively considered still. This
# profile list is setup to involve node sharing as considered in
# https://github.com/2i2c-org/infrastructure/issues/2121.
#
# - Memory requests are different from the description, based on
# what is found to remain allocatable in k8s, subtracting 1 GiB of
# overhead for misc system pods, and transitioning from GB in the
# description to GiB in mem_guarantee.
# - CPU requests are lower than the description, with a factor of
# 10%.
#
# "Small" profile: user pods are placed on r5.xlarge nodes and the user picks
# how large a share of such a node to be guaranteed.
- display_name: "Small: up to 4 CPU / 32 GB RAM"
# The anchor defined here is reused through an alias by the other node
# sharing profiles, so the description text stays identical across them.
description: &profile_list_description "Start a container with at least a chosen share of capacity on a node of this type"
slug: small
# This is the profile preselected in the profile list.
default: true
profile_options:
requests:
# NOTE: Node share choices are in active development, see comment
# next to profileList: above.
display_name: Node share
# Each choice pairs an advertised size (display_name) with the guarantees
# actually set on the pod. The guarantees are deliberately lower than the
# advertised numbers: cpu_guarantee is about 10% of the displayed CPU and
# mem_guarantee is scaled down (28.937G for the full "~32 GB" choice).
choices:
mem_1:
# Preselected node share for this profile.
default: true
display_name: ~1 GB, ~0.125 CPU
kubespawner_override:
mem_guarantee: 0.904G
cpu_guarantee: 0.013
mem_2:
display_name: ~2 GB, ~0.25 CPU
kubespawner_override:
mem_guarantee: 1.809G
cpu_guarantee: 0.025
mem_4:
display_name: ~4 GB, ~0.5 CPU
kubespawner_override:
mem_guarantee: 3.617G
cpu_guarantee: 0.05
mem_8:
display_name: ~8 GB, ~1.0 CPU
kubespawner_override:
mem_guarantee: 7.234G
cpu_guarantee: 0.1
mem_16:
display_name: ~16 GB, ~2.0 CPU
kubespawner_override:
mem_guarantee: 14.469G
cpu_guarantee: 0.2
mem_32:
display_name: ~32 GB, ~4.0 CPU
kubespawner_override:
mem_guarantee: 28.937G
cpu_guarantee: 0.4
# Overrides applied to every choice of this profile.
kubespawner_override:
# NOTE(review): null presumably clears limits inherited from chart
# defaults, leaving only the guarantees above in effect - confirm.
cpu_limit: null
mem_limit: null
# Only schedule on nodes labelled with this EC2 instance type.
node_selector:
node.kubernetes.io/instance-type: r5.xlarge
# "Medium" profile: user pods are placed on r5.4xlarge nodes and the user
# picks how large a share of such a node to be guaranteed.
- display_name: "Medium: up to 16 CPU / 128 GB RAM"
# Alias to the description string anchored on the "small" profile.
description: *profile_list_description
slug: medium
profile_options:
requests:
# NOTE: Node share choices are in active development, see comment
# next to profileList: above.
display_name: Node share
# Each choice pairs an advertised size (display_name) with the guarantees
# actually set on the pod. The guarantees are deliberately lower than the
# advertised numbers: cpu_guarantee is about 10% of the displayed CPU and
# mem_guarantee is scaled down (120.513G for the full "~128 GB" choice).
choices:
mem_1:
display_name: ~1 GB, ~0.125 CPU
kubespawner_override:
mem_guarantee: 0.942G
cpu_guarantee: 0.013
mem_2:
display_name: ~2 GB, ~0.25 CPU
kubespawner_override:
mem_guarantee: 1.883G
cpu_guarantee: 0.025
mem_4:
# Preselected node share for this profile.
default: true
display_name: ~4 GB, ~0.5 CPU
kubespawner_override:
mem_guarantee: 3.766G
cpu_guarantee: 0.05
mem_8:
display_name: ~8 GB, ~1.0 CPU
kubespawner_override:
mem_guarantee: 7.532G
cpu_guarantee: 0.1
mem_16:
display_name: ~16 GB, ~2.0 CPU
kubespawner_override:
mem_guarantee: 15.064G
cpu_guarantee: 0.2
mem_32:
display_name: ~32 GB, ~4.0 CPU
kubespawner_override:
mem_guarantee: 30.128G
cpu_guarantee: 0.4
mem_64:
display_name: ~64 GB, ~8.0 CPU
kubespawner_override:
mem_guarantee: 60.257G
cpu_guarantee: 0.8
mem_128:
display_name: ~128 GB, ~16.0 CPU
kubespawner_override:
mem_guarantee: 120.513G
cpu_guarantee: 1.6
# Overrides applied to every choice of this profile.
kubespawner_override:
# NOTE(review): null presumably clears limits inherited from chart
# defaults, leaving only the guarantees above in effect - confirm.
cpu_limit: null
mem_limit: null
# Only schedule on nodes labelled with this EC2 instance type.
node_selector:
node.kubernetes.io/instance-type: r5.4xlarge
# "Large" profile: user pods are placed on r5.16xlarge nodes and the user
# picks how large a share of such a node to be guaranteed.
- display_name: "Large: up to 64 CPU / 512 GB RAM"
# Alias to the description string anchored on the "small" profile.
description: *profile_list_description
slug: large
profile_options:
requests:
# NOTE: Node share choices are in active development, see comment
# next to profileList: above.
display_name: Node share
# Each choice pairs an advertised size (display_name) with the guarantees
# actually set on the pod. The guarantees are deliberately lower than the
# advertised numbers: cpu_guarantee is about 10% of the displayed CPU and
# mem_guarantee is scaled down (489.13G for the full "~512 GB" choice).
# The smallest share offered on this node type is ~4 GB.
choices:
mem_4:
display_name: ~4 GB, ~0.5 CPU
kubespawner_override:
mem_guarantee: 3.821G
cpu_guarantee: 0.05
mem_8:
display_name: ~8 GB, ~1.0 CPU
kubespawner_override:
mem_guarantee: 7.643G
cpu_guarantee: 0.1
mem_16:
# Preselected node share for this profile.
default: true
display_name: ~16 GB, ~2.0 CPU
kubespawner_override:
mem_guarantee: 15.285G
cpu_guarantee: 0.2
mem_32:
display_name: ~32 GB, ~4.0 CPU
kubespawner_override:
mem_guarantee: 30.571G
cpu_guarantee: 0.4
mem_64:
display_name: ~64 GB, ~8.0 CPU
kubespawner_override:
mem_guarantee: 61.141G
cpu_guarantee: 0.8
mem_128:
display_name: ~128 GB, ~16.0 CPU
kubespawner_override:
mem_guarantee: 122.282G
cpu_guarantee: 1.6
mem_256:
display_name: ~256 GB, ~32.0 CPU
kubespawner_override:
mem_guarantee: 244.565G
cpu_guarantee: 3.2
mem_512:
display_name: ~512 GB, ~64.0 CPU
kubespawner_override:
mem_guarantee: 489.13G
cpu_guarantee: 6.4
# Overrides applied to every choice of this profile.
kubespawner_override:
# NOTE(review): null presumably clears limits inherited from chart
# defaults, leaving only the guarantees above in effect - confirm.
cpu_limit: null
mem_limit: null
# Only schedule on nodes labelled with this EC2 instance type.
node_selector:
node.kubernetes.io/instance-type: r5.16xlarge

# GPU profile: the user chooses an ML image and the pod requests one
# NVIDIA GPU on a g4dn.xlarge node.
- display_name: "NVIDIA Tesla T4, ~16 GB, ~4 CPUs"
  slug: gpu
  description: "Start a container on a dedicated node with a GPU"
  profile_options:
    image:
      display_name: Image
      choices:
        tensorflow:
          display_name: Pangeo Tensorflow ML Notebook
          slug: "tensorflow"
          kubespawner_override:
            image: "pangeo/ml-notebook:b9584f6"
        pytorch:
          display_name: Pangeo PyTorch ML Notebook
          default: true
          slug: "pytorch"
          kubespawner_override:
            image: "pangeo/pytorch-notebook:b9584f6"
  # Overrides shared by both image choices.
  kubespawner_override:
    mem_limit: null
    mem_guarantee: 14G
    # The instance type label has to be nested under node_selector. As a
    # bare key of kubespawner_override it is not a KubeSpawner option and
    # selects nothing, so it is set once here instead of in each choice.
    node_selector:
      node.kubernetes.io/instance-type: g4dn.xlarge
    extra_resource_limits:
      nvidia.com/gpu: "1"
58 changes: 0 additions & 58 deletions config/clusters/2i2c-aws-us/researchdelight.values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,64 +42,6 @@ basehub:
# Temporarily set for *all* pods, including pods without any GPUs,
# to work around https://github.com/2i2c-org/infrastructure/issues/1530
NVIDIA_DRIVER_CAPABILITIES: compute,utility
profileList:
# The mem-guarantees are here so k8s doesn't schedule other pods
# on these nodes.
- display_name: "Small: m5.large"
description: "~2 CPU, ~8G RAM"
default: true
kubespawner_override:
# Explicitly unset mem_limit, so it overrides the default memory limit we set in
# basehub/values.yaml
mem_limit: 8G
mem_guarantee: 6.5G
node_selector:
node.kubernetes.io/instance-type: m5.large
- display_name: "Medium: m5.xlarge"
description: "~4 CPU, ~15G RAM"
kubespawner_override:
mem_limit: 15G
mem_guarantee: 12G
node_selector:
node.kubernetes.io/instance-type: m5.xlarge
- display_name: "Large: m5.2xlarge"
description: "~8 CPU, ~30G RAM"
kubespawner_override:
mem_limit: 30G
mem_guarantee: 25G
node_selector:
node.kubernetes.io/instance-type: m5.2xlarge
- display_name: "Huge: m5.8xlarge"
description: "~16 CPU, ~60G RAM"
kubespawner_override:
mem_limit: 60G
mem_guarantee: 50G
node_selector:
node.kubernetes.io/instance-type: m5.8xlarge
- display_name: "Large + GPU"
description: "14GB RAM, 4 CPUs, T4 GPU"
profile_options:
image:
display_name: Image
choices:
tensorflow:
display_name: Pangeo Tensorflow ML Notebook
slug: "tensorflow"
kubespawner_override:
node.kubernetes.io/instance-type: g4dn.xlarge
image: "pangeo/ml-notebook:b9584f6"
pytorch:
display_name: Pangeo PyTorch ML Notebook
default: true
slug: "pytorch"
kubespawner_override:
node.kubernetes.io/instance-type: g4dn.xlarge
image: "pangeo/pytorch-notebook:b9584f6"
kubespawner_override:
mem_limit: null
mem_guarantee: 14G
extra_resource_limits:
nvidia.com/gpu: "1"
hub:
config:
Authenticator:
Expand Down
19 changes: 6 additions & 13 deletions eksctl/2i2c-aws-us.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,9 @@ local nodeAz = "us-west-2a";
// A `node.kubernetes.io/instance-type label is added, so pods
// can request a particular kind of node with a nodeSelector
local notebookNodes = [
{ instanceType: "m5.large" },
{ instanceType: "m5.xlarge" },
{ instanceType: "m5.2xlarge" },
{ instanceType: "m5.8xlarge" },
{ instanceType: "r5.xlarge" },
{ instanceType: "r5.4xlarge" },
{ instanceType: "r5.16xlarge" },
{
instanceType: "g4dn.xlarge",
tags+: {
Expand All @@ -44,10 +43,7 @@ local daskNodes = [
// *first* item in instanceDistribution.instanceTypes, to match
// what we do with notebook nodes. Pods can request a particular
// kind of node with a nodeSelector
{ instancesDistribution+: { instanceTypes: ["m5.large"] }},
{ instancesDistribution+: { instanceTypes: ["m5.xlarge"] }},
{ instancesDistribution+: { instanceTypes: ["m5.2xlarge"] }},
{ instancesDistribution+: { instanceTypes: ["m5.8xlarge"] }},
{ instancesDistribution+: { instanceTypes: ["r5.4xlarge"] }},
];


Expand All @@ -57,10 +53,7 @@ local daskNodes = [
metadata+: {
name: "2i2c-aws-us",
region: clusterRegion,
// Warning: version 1.23 introduces some breaking changes
// Checkout the docs before upgrading
// ref: https://docs.aws.amazon.com/eks/latest/userguide/ebs-csi-migration-faq.html
version: '1.22'
version: '1.25'
},
availabilityZones: masterAzs,
iam: {
Expand Down Expand Up @@ -92,7 +85,7 @@ local daskNodes = [
ssh: {
publicKeyPath: 'ssh-keys/2i2c-aws-us.key.pub'
},
instanceType: "m5.xlarge",
instanceType: "r5.xlarge",
minSize: 1,
maxSize: 6,
labels+: {
Expand Down
1 change: 1 addition & 0 deletions eksctl/ssh-keys/2i2c-aws-us.key.pub
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQDQcjyAtAy1dt/geq+lKfQ/EeUgZevd1LAbVMoT8x0BrhjiLZ+vK8YSTrIPvsGOziRCTtxudtufOCSh8fNW6+yq9Ano7OCFAADzVOoUtnzJkceUf9YhqV8RYEqrabc5/oxxK8ResOx1z4ibu3aL5jBWYxFKKJsM7hhe06IaJ6c71Vio3rh4MI2WFu5y8TzdR1jkfvQpouI5+DRVy/Nlc5M7K6DhpX6HVmTSDwd85TBNRzeCZkJ9yf4rsnokvJUFDw94FJN7x1yLzLg9uwb8Fmzp0WFc4WVy7ICn0O0JgXQ8FTxYxsJvPZo7eP6ib0nDo2csHMQZ+jGErWXUFY+GOJ8/Rba1jnMt0XkX3tG++7DGNhxCEz4zMthBG7dkRO9OYEEpRHJhAvwgyA55Ij1k5y2BOla+nDdwKhlabd5iCVo+wTOZO2Pz7iq4hI3Pa6zn+rC2zXu++UTiJj/E6CE0s4EQrp8OLkkWmkweIxihhxUBVzeUgWu57tEqjpUAfyhNcGk= erik@dl
21 changes: 21 additions & 0 deletions eksctl/ssh-keys/secret/2i2c-aws-us.key
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"data": "ENC[AES256_GCM,data:6i3uRRze7MnHVZ3RHR/i9VnABJ4TR1n192P7gu+YwSuTFIg9+fG9sj/IhU+OK2jkJf1w+160WoifILDXh92it9vQEPPurmsN8R0wvTd/XiGSsvbnWSxg4cS2NHzNBv6r4LftJgJ78E/uhKnNag0/2AI9Y3k3IZBHWRWNFsd/eazqXFM8zrF3K+mbqg1hm64tlQHbb2SQAGXt7x0tIL3ekkzB6+qQXc+jFqqh8eU5zLpprOiK6HanK+OpSeh6/3aMosndKPStnEZ1fwMcxVR6h0nk7VZlfK5NKIQjuQhISS6qarBGGTJYX+51zxS9T8AqUxwDLpJ7xQ68ixJFL6+CRWxceGsqjOd2J9j5/Jayl+3d62bumrI6idvoVzV2LiSMbsp5FQICE6wEtLzKd3YGAQZROAXSMXmHaz022G4bib6EvtB9RpgJDVKHyCfU7Dp0RqkLQhDCEcJJMyQZ1nWpLJlDDGuPYT6iujqkovCqH6LxvLrPNEVp+8pxTKaogt8nm33YlXgwxj/KqzDY02TnTWgWwL3GoBHW1Udg7Kxk2uYGhLcZlelIZErX24W79gSIXVx+mMVlqu+zpqXgrf45v5r6yg/1UGjmERUidMERWcJg++c53ermoSFMGUki7WRWK1Md52A1UVgUxqnDpRSBUTRoAnUU91oXv+oSCZFbdfYC9q6M0RA23HMkC837bLr6S+cswOwf/E3Fd4stozpcxEq4ocEb3lBwix4tblimLTxgqRpCHPCQgCgWgMMWQHDzM6yNVxyrTMK8ljxsY8DAQwMV52VT5LaSSBCzK074T4BoqbcGw4N5098bR0XdfoOEfXPkki/FyUkuRRz2dStCOTzWIIv3mbLENEyoGyDFOqhXCBLV3Lqm7nCTjJX0yaIDPO6jF8L7fThUPQN6rPznVgH8Dvi3iR71l9ALSfQF1+EtD+10cT6bHKU5aGjeuAgxFa/q5V/NkhY6cUR4S9xO5BB/piOyWehpP4hR0b7mGKoH2CD0Wvot2hMBywITVwRAaOcmWhGGw6B1W+jeMAAYgdKtJ18c+Nm8N+3M+rpIdLVyaTffGZxELAhM1SYTqjBj1IUZ3HzqmwDzu444o4nhktKMWHgdmtbbdhi3b30LzuAejfhY3T3hKgt2FqjycFfZ+9Oydt4LP21HW+Q9Xr2yzptXXOThC6/iLthKpHueL6PVGON6eOypOl0depKlNzFu3Se1u76BXaNvJ+KaQlAS7AdPnDR2D3naXmnp/XLyk+KKbTUI60P+ZnEhhYJp0BNHjoI5b6goknU7GKJzCwKyW3B47dR/vm82MohI8b+ztyh8iJYDsUluIUfSBIo+/2rRulB5yHUVXMjVnCVy0ly7wTxrZ/RjOSAwDgYky8O0gUBkA5TXMLaOb8o5ivKLQLlYX5jOroe37tbI2v9RDIwYfzGK1r7q4V26GP5IxyqRaWGWLrQJZtjMvpek1M5+rIR9GppfitSh21lH3Gru9Jt+O9UyRB8dFlqm3S1wxYOanW0ltqPiExUoVBTLLHbU/PGZwlVSAhTQo8g0dkKSvYxnXa4kraJ3g9b9y275+Z/0NV8QZf1cGDSGTYEw626iXq7CwY5nw95x9IjrIIe/XaPHErL+dcda1k7/YUOPEgb0C/r5bZRmTgURXVpbAsKrOpVaKT2px08nNDkEzAqdyd3pcVsI2niy7UgRQS9J6SdD3JpCRdjh5RTCypdm/b2XMLGGqIi/NYWwf9IK1TR5XtjpoAUCwJ12tY79GPzC8mGBFsYebaPMF8HJtGkamzlJMVAIxzlJr7dbKscdU0DQrxMG9avmSq4UotEOWWQA1uhhGnEeCFeIUI1+a3k/9wCmR4JzMF2f1uc9o8LK/+VkVQp0OmMHPEXilMo0UX+m/gQ/lFzbjXgsPPGuelujcIurkfDZrXn12eQkq0dQcYBosztpcqBbeTRCScPl6GTqF7lsP445NDF/L+W
v0mz5oje7qs0zgUIiC0v9QzprvlmophFeUhNcYEVJ/BrzO13BLtAI+Q9Jkpo6VNiWSdJsR24nWwcbaRrL7WEOh9hkGvVvZ/SOI7x6MhMExvj5rXlvp30fbXw1Qnt6Rhzt1JlIvTbbCGOvpH1sdeuHa+th9tjsjQGVBrdqjSCgtwLz/uTWf5I4gN6Eu4f7SZhuFf09NS+cuHIQxpIDsJ0cRO6PKrNb5QN4fxqCSpMl243ZOo3Ffg9AQO3yMNmHfeFqUIhQV5pGGIgZhAQ/ReJ2wTEsQ3vncLFmoCZJUMs35tiTR0dGXWH3/f4G76TMc+9go/sSRO81SpGbsO7iKWeI1nt2gaTeeJZNYBMHURp8+otTNwixyXelRfc9SDNG6fKuMrO7PcHogVogdyLGpwVwsGI6hwudnKk9FwaSkGYIickkH0us3GvUoXU6D+niTUI1FXE4N3d1HvG1xre/mut1e/WCmWvszwi3Qd/jZzoy0szwAH8Xb0Cj/EC+O+VwRuo1VbpX09gCVYIOA3hRMRSCu8xCbUK9Otmg93HDPsgFcAGHxsTXK6dLWK4/BMKK4AdINaZ/V1iheP0CC7pDhiMXqeZ+5hMIfo8UnEPh3hFFBlNrLGoad3/KoTL3Zdq9d3KSsv/3PB6D68lzhTibWIbJytDi1nfg8JhmWAh7KnVR4AhgAu9OF6frn3jUkHJpiW0QDbwGS5gTsd/9qinXyICMtlmWKs98Xz/NWCl9P0eSJMwG1B9fOVCCQCIjw3jzscKm+UY3ph+D5j7Chz6TDAZSmcUy2qBEDOPSzmzr0fhoHqgV0W+Lu776EoDl8PnWXNlyhPXrfweZ2M6I5h0D4aOmfuYX5/GnTNm2YH4AXc5nIpaAeTYh89YS0+JXpfVdJZk7pq8R1EaiuH3aiLqd+A/uAvwyUrNiif2rjY4hqu1hZ30vFlAiL5u8FtkDtsP5OgSQrojbPi+XYKX+QobF2RzzHpIRhyQHaPJlOHQ4NcKGKIJj6vi560EVTGspYb8mrPFvgV7wYY0QXXRwA8KIjv4sGIs30CgeyRWSxm8VUbKgzX+2cqe8Ynons/SNx9wLXajED+vM6Ss4yjTN2zwhHDKmkwzHakIWNxDAZ9EcUwe1YJKCWXMrDvOzvCNiRdTR2VbevBe64ngd0KuIFRHCtngrilJHdXjZ1lLvhYmgTK2rr9VjCu+4LeSMfNNDOELcbHAIn4toX55X3F9w4HNvxogsIgotYrMAuJxZghlyFUYeco85XCKeUiAbo9HO01aEWWqivWYPZeVRi57XP8tegZtPMiPhjUkSaFiK+KPbwoc+iLO+D10METHsqFGVltgCeqcQRHfOf4gNeHPKtyRR85n918U4b6kZ1rdYyG2BtfkK8iWRDLM9QLveJqjD2EoJWegmZvQqSJIQGlKQzmjyF/99V9Ba1LFI/VU/wm2Yz+eNmV2j1v4X1E+An+zpVS0mkLQ656dHIPMg6Q==,iv:MdZKij6wQXdl0gVbAkOFqLuiCEHesv+ZLVeoK3zKee8=,tag:3N/UVnR2vAqkoaJMwq+EmA==,type:str]",
"sops": {
"kms": null,
"gcp_kms": [
{
"resource_id": "projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs",
"created_at": "2023-03-13T01:03:44Z",
"enc": "CiUA4OM7eDPJQVDE7NEVqjWdmXpOQTl+7QK5hGpY3AdSZ3yEw45JEkkALQgViChPhBj8tmk9q6QdVCMb4VivkTQireyREkccm5p2zKgsPCOXz/xmV8FoFKOSv1MEcOjRSR2jyAzAZ7v6+1cpSkJbqVvQ"
}
],
"azure_kv": null,
"hc_vault": null,
"age": null,
"lastmodified": "2023-03-13T01:03:45Z",
"mac": "ENC[AES256_GCM,data:YP8wuY4D1dYvWgSPLySqXnZAtmDC3Ti8fBILmu4uUXw+rpYu2tkuhMUYDaWtfSm7K4wZXkWgHkOyn1PRuI8d6SZxscup0rgGeGM9M7dNFoGQtKec0JMLpkmUdnTBqJQ/9U4agp/4CcqAryYn1pIIqmf9cYd/PiCl/uiIzuDtTJY=,iv:wSsby3xmTALAKzZ08M2Y+gGf/2SZelb5V7sdnaTjG8Y=,tag:QJmC828jRr+i52bpuJDNUQ==,type:str]",
"pgp": null,
"unencrypted_suffix": "_unencrypted",
"version": "3.7.2"
}
}

0 comments on commit fec9a13

Please sign in to comment.