Skip to content

Commit

Permalink
deployment: Use autoscaling workers
Browse files Browse the repository at this point in the history
  • Loading branch information
schneefux committed Apr 30, 2022
1 parent 3239377 commit 6ed748a
Show file tree
Hide file tree
Showing 20 changed files with 172 additions and 61 deletions.
94 changes: 94 additions & 0 deletions deployment/infra/conf/cloudinit-database.yml.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#cloud-config
# sync with cloudinit-*.yml.tpl
# Shell commands executed on first boot of a "database" class node.
runcmd:
# Disable SSH password logins (key-based access only).
- sed -i -e '/^\(#\|\)PasswordAuthentication/s/^.*$/PasswordAuthentication no/' /etc/ssh/sshd_config
# Create the Nomad host volume directories before the agent starts.
# "certs" is kept for parity with the sibling cloudinit-*.yml.tpl templates;
# "database" is the host_volume declared in nomad.hcl in this template, and
# Nomad requires a host_volume path to exist when the client starts.
- mkdir -p /opt/nomad/volumes/certs
- mkdir -p /opt/nomad/volumes/database
- chown -R nomad:nomad /opt/nomad/volumes
# Replace systemd-resolved with dnsmasq as the local resolver, then bring up the cluster agents.
- systemctl stop systemd-resolved
- systemctl disable systemd-resolved
- systemctl enable nomad consul dnsmasq
- systemctl start nomad consul dnsmasq
# Install the Datadog agent (v7) using the API key passed in as a template variable.
- DD_AGENT_MAJOR_VERSION=7 DD_API_KEY=${datadog_api_key} DD_SITE="datadoghq.com" bash -c "$(curl -L https://s3.amazonaws.com/dd-agent/scripts/install_script.sh)"
# Add the agent user to the docker group.
- usermod -a -G docker dd-agent
# Accept DogStatsD and APM traffic from non-local sources, then reload the agent.
- "echo \"dogstatsd_non_local_traffic: true\napm_config:\n apm_non_local_traffic: true\" >> /etc/datadog-agent/datadog.yaml"
- systemctl restart datadog-agent
# Additional apt source: the HashiCorp package repository, identified by its signing key id.
# NOTE(review): nomad and consul in the packages list presumably come from this repository — confirm.
apt:
sources:
hashicorp:
source: "deb [arch=amd64] https://apt.releases.hashicorp.com $RELEASE main"
keyid: E8A032E094D8EB4EA189D270DA418C88A3219F7B
# Packages to install on the node. nomad, consul and dnsmasq are the services
# that runcmd enables and starts; they are configured through write_files.
packages:
- apt-transport-https
- nomad
- consul
- dnsmasq
- mariadb-client
- jq
# Configuration files written to the node: dnsmasq, the Nomad agent and the Consul agent.
write_files:
# dnsmasq: queries for the "consul" domain go to 127.0.0.1 port 8600
# (presumably the local Consul DNS interface — confirm); all other queries go
# to the two listed upstream servers. brawltime.ninja is pinned to 10.0.0.2.
- path: /etc/dnsmasq.conf
content: |
local-service
no-resolv
server=/consul/127.0.0.1#8600
server=185.12.64.1
server=185.12.64.2
address=/brawltime.ninja/10.0.0.2
cache-size=65536
# Nomad agent: acts as both server (bootstrap_expect = 3) and client in
# datacenter dc1, advertises on the private interface matching 10.0.0.*,
# exposes the "database" host volume, reserves port 22 and takes its
# node_class from the "class" template variable. Telemetry is sent to
# localhost:8125.
- path: /etc/nomad.d/nomad.hcl
content: |
advertise {
http = "{{ GetPrivateInterfaces | include \"address\" \"10.0.0.*\" | attr \"address\" }}"
rpc = "{{ GetPrivateInterfaces | include \"address\" \"10.0.0.*\" | attr \"address\" }}"
serf = "{{ GetPrivateInterfaces | include \"address\" \"10.0.0.*\" | attr \"address\" }}"
}
datacenter = "dc1"
data_dir = "/opt/nomad"

server {
enabled = true
bootstrap_expect = 3
}

client {
enabled = true
network_interface = "{{ GetPrivateInterfaces | include \"address\" \"10.0.0.*\" | attr \"name\" }}"
host_volume "database" {
path = "/opt/nomad/volumes/database"
read_only = false
}

reserved {
reserved_ports = "22"
}

node_class = "${class}"
}

plugin "docker" {
config {
allow_privileged = true
}
}

telemetry {
publish_allocation_metrics = true
publish_node_metrics = true
datadog_address = "localhost:8125"
disable_hostname = true
collection_interval = "10s"
}
# Consul agent: server mode in datacenter dc1 with the UI enabled, expecting
# 3 servers and joining the cluster through 10.0.0.2.
- path: /etc/consul.d/consul.hcl
content: |
advertise_addr = "{{ GetPrivateInterfaces | include \"address\" \"10.0.0.*\" | attr \"address\" }}"
client_addr = "0.0.0.0"
datacenter = "dc1"
data_dir = "/opt/consul"
ui_config {
enabled = true
}

server = true
bootstrap_expect = 3
retry_join = ["10.0.0.2"]
7 changes: 3 additions & 4 deletions deployment/infra/conf/cloudinit-ingress.yml.tpl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#cloud-config
# sync with cloudinit-worker.yml.tpl
# sync with cloudinit-*.yml.tpl
runcmd:
- sed -i -e '/^\(#\|\)PasswordAuthentication/s/^.*$/PasswordAuthentication no/' /etc/ssh/sshd_config
- mkdir -p /opt/nomad/volumes/certs
Expand Down Expand Up @@ -46,7 +46,7 @@ write_files:

server {
enabled = true
bootstrap_expect = 1
bootstrap_expect = 3
}

client {
Expand Down Expand Up @@ -90,5 +90,4 @@ write_files:
}

server = true
bootstrap_expect = 1
EOF
bootstrap_expect = 3
7 changes: 1 addition & 6 deletions deployment/infra/conf/cloudinit-worker.yml.tpl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#cloud-config
# sync with cloudinit-ingress.yml.tpl
# sync with cloudinit-*.yml.tpl
runcmd:
- sed -i -e '/^\(#\|\)PasswordAuthentication/s/^.*$/PasswordAuthentication no/' /etc/ssh/sshd_config
- mkdir -p /opt/nomad/volumes/certs
Expand Down Expand Up @@ -44,11 +44,6 @@ write_files:
enabled = true
network_interface = "{{ GetPrivateInterfaces | include \"address\" \"10.0.0.*\" | attr \"name\" }}"
host_volume "certs" {
path = "/opt/nomad/volumes/certs"
read_only = false
}

reserved {
reserved_ports = "22"
}
Expand Down
10 changes: 7 additions & 3 deletions deployment/infra/hetzner.tf
Original file line number Diff line number Diff line change
Expand Up @@ -106,21 +106,25 @@ variable "servers" {
class = "ingress"
}
colt = {
server_type = "cpx11"
class = "worker"
server_type = "cx21"
class = "database"
}
dynamike = {
server_type = "cpx31"
class = "database"
}
/*
edgar = {
server_type = "cpx11"
class = "worker"
}
*/
/*
frank = {
server_type = "cpx11"
class = "worker"
}
*/
/*
gene = {
server_type = "cpx11"
Expand Down Expand Up @@ -167,7 +171,7 @@ resource "hcloud_server" "default" {
server_type = each.value.server_type
keep_disk = true
ssh_keys = [hcloud_ssh_key.default.id]
user_data = templatefile(each.value.class == "ingress" ? "${path.module}/conf/cloudinit-ingress.yml.tpl" : "${path.module}/conf/cloudinit-worker.yml.tpl", {
user_data = templatefile("${path.module}/conf/cloudinit-${each.value.class}.yml.tpl", {
class = each.value.class,
datadog_api_key = var.datadog_api_key,
})
Expand Down
4 changes: 4 additions & 0 deletions deployment/infra/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,7 @@ output "ingress_ssh_key" {
output "public_ip4" {
value = [for server in hcloud_server.default : server.ipv4_address]
}

output "network_id" {
value = hcloud_network.default.id
}
29 changes: 17 additions & 12 deletions deployment/jobs/autoscaler.nomad
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
variable "hcloud_token" {}
variable "ssh_public_key_name" {}
variable "datadog_api_key" {}
variable "brawltime_net_id" {}

job "autoscaler" {
datacenters = ["dc1"]
Expand Down Expand Up @@ -29,12 +30,12 @@ job "autoscaler" {

args = [
"-c",
"mkdir -p ${NOMAD_TASK_DIR}/plugins && cp ${NOMAD_TASK_DIR}/nomad-hcloud-autoscaler ${NOMAD_TASK_DIR}/plugins/hcloud-server && chmod +x ${NOMAD_TASK_DIR}/plugins/hcloud-server && nomad-autoscaler agent -config ${NOMAD_TASK_DIR}/config.hcl -plugin-dir ${NOMAD_TASK_DIR}/plugins -policy-dir ${NOMAD_TASK_DIR}/policies -http-bind-address 0.0.0.0 -http-bind-port ${NOMAD_PORT_http}",
"mkdir -p ${NOMAD_TASK_DIR}/plugins && cp ${NOMAD_TASK_DIR}/hcloud-server ${NOMAD_TASK_DIR}/plugins/hcloud-server && chmod +x ${NOMAD_TASK_DIR}/plugins/hcloud-server && nomad-autoscaler agent -config ${NOMAD_TASK_DIR}/config.hcl -plugin-dir ${NOMAD_TASK_DIR}/plugins -policy-dir ${NOMAD_TASK_DIR}/policies -http-bind-address 0.0.0.0 -http-bind-port ${NOMAD_PORT_http}",
]
}

artifact {
source = "https://github.com/AndrewChubatiuk/nomad-hcloud-autoscaler/releases/download/v0.0.2/nomad-hcloud-autoscaler"
source = "https://github.com/AndrewChubatiuk/nomad-hcloud-autoscaler/releases/download/v0.1.1/hcloud-server"
}

template {
Expand Down Expand Up @@ -73,6 +74,8 @@ job "autoscaler" {
# reduce load on Nomad and slow down scaling
default_evaluation_interval = "1m"
}
log_level = "INFO"
EOF

destination = "${NOMAD_TASK_DIR}/config.hcl"
Expand All @@ -96,12 +99,13 @@ job "autoscaler" {
evaluation_interval = "10m"
# TODO checks also take database/ingress allocations into account
# TODO target-value assumes constant CPU/RAM per unit
check "node-cpu" {
source = "nomad-apm"
query = "percentage-allocated_cpu"
strategy "target-value" {
target = 80
target = 90
}
}
Expand All @@ -110,20 +114,19 @@ job "autoscaler" {
query = "percentage-allocated_memory"
strategy "target-value" {
target = 80
target = 90
}
}
# sync with hetzner.tf
target "hcloud-server" {
# combined filters are only supported since Nov 2021 https://github.com/hashicorp/nomad-autoscaler/pull/535
# the plugin was built Feb 2021
#datacenter = "dc01"
# must equal the datacenter the Nomad agents register in ("dc1" in the cloud-init templates and job files),
# otherwise the autoscaler's node pool filter would not match the worker nodes
datacenter = "dc1"
node_class = "worker"
node_purge = "true"
dry-run = "true"
#dry-run = "true"
hcloud_location = "nbg1"
hcloud_image = "docker-ce"
hcloud_group_id = "autoscale"
hcloud_user_data = <<-EOOF
${
regex_replace(
Expand All @@ -134,10 +137,11 @@ job "autoscaler" {
}
EOOF
hcloud_ssh_keys = "${var.ssh_public_key_name}"
# scaling-intensive services (web, cube, render) have a 1:2 RAM:CPU ratio, so pick cpx11 over cx21
hcloud_server_type = "cpx11"
hcloud_name_prefix = "brawltime"
hcloud_labels = "firewall=true,nomad_class=worker"
hcloud_networks = "brawltime-net"
# id of hetzner network brawltime-net
hcloud_networks = "${var.brawltime_net_id}"
}
}
}
Expand All @@ -150,8 +154,9 @@ job "autoscaler" {
}

resources {
cpu = 50
memory = 128
cpu = 64
memory = 64
memory_max = 128
}

service {
Expand Down
6 changes: 4 additions & 2 deletions deployment/jobs/brawltime-clicker.nomad
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ job "brawltime-clicker" {
source = "nomad-apm"
group = "cpu-allocated"
query = "avg_cpu-allocated"
query_window = "10m"

strategy "threshold" {
upper_bound = 100
Expand All @@ -48,6 +49,7 @@ job "brawltime-clicker" {
source = "nomad-apm"
group = "cpu-allocated"
query = "avg_cpu-allocated"
query_window = "10m"

strategy "threshold" {
upper_bound = 20
Expand Down Expand Up @@ -101,9 +103,9 @@ job "brawltime-clicker" {
}

resources {
cpu = 128
cpu = 32
memory = 64
memory_max = 96
memory_max = 128
}
}
}
Expand Down
11 changes: 7 additions & 4 deletions deployment/jobs/brawltime-cube.nomad
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@ variable "domain" {
job "brawltime-cube" {
datacenters = ["dc1"]

affinity {
constraint {
attribute = "${node.class}"
operator = "regexp"
value = "worker"
}

Expand All @@ -36,6 +35,7 @@ job "brawltime-cube" {
source = "nomad-apm"
group = "cpu-allocated-cube"
query = "avg_cpu-allocated"
query_window = "10m"

strategy "threshold" {
upper_bound = 100
Expand All @@ -49,6 +49,7 @@ job "brawltime-cube" {
source = "nomad-apm"
group = "cpu-allocated-cube"
query = "avg_cpu-allocated"
query_window = "10m"

strategy "threshold" {
upper_bound = 20
Expand Down Expand Up @@ -121,7 +122,7 @@ job "brawltime-cube" {
}

group "cube_refresh" {
count = 2
count = 1

scaling {
enabled = true
Expand All @@ -133,6 +134,7 @@ job "brawltime-cube" {
source = "nomad-apm"
group = "cpu-allocated-refresh"
query = "avg_cpu-allocated"
query_window = "10m"

strategy "threshold" {
upper_bound = 100
Expand All @@ -146,6 +148,7 @@ job "brawltime-cube" {
source = "nomad-apm"
group = "cpu-allocated-refresh"
query = "avg_cpu-allocated"
query_window = "10m"

strategy "threshold" {
upper_bound = 20
Expand Down Expand Up @@ -179,7 +182,7 @@ job "brawltime-cube" {
}

resources {
cpu = 64
cpu = 32
memory = 128
memory_max = 256
}
Expand Down
7 changes: 4 additions & 3 deletions deployment/jobs/brawltime-manager.nomad
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,9 @@ variable "domain" {
job "brawltime-manager" {
datacenters = ["dc1"]

affinity {
constraint {
attribute = "${node.class}"
operator = "regexp"
value = "worker"
value = "database"
}

update {
Expand All @@ -39,6 +38,7 @@ job "brawltime-manager" {
source = "nomad-apm"
group = "cpu-allocated"
query = "avg_cpu-allocated"
query_window = "10m"

strategy "threshold" {
upper_bound = 100
Expand All @@ -52,6 +52,7 @@ job "brawltime-manager" {
source = "nomad-apm"
group = "cpu-allocated"
query = "avg_cpu-allocated"
query_window = "10m"

strategy "threshold" {
upper_bound = 20
Expand Down
Loading

0 comments on commit 6ed748a

Please sign in to comment.