Review notes (free-form preamble; patch tools skip all text before the first file header).
Purpose: add a cloud-init template for "database"-class nodes, select the cloud-init template by node class in hetzner.tf, raise Nomad/Consul bootstrap_expect from 1 to 3 on ingress, replace node-class affinities with constraints in most jobs, and switch the hcloud autoscaler target from dry-run to live.
Fix 1 (cloudinit-database.yml.tpl): runcmd now also creates /opt/nomad/volumes/database, the path of host_volume "database"; before, only the certs directory was created ahead of starting nomad.
Fix 2 (autoscaler.nomad): the hcloud-server target used datacenter "dc01" while the agent configs and all jobs in this patch use "dc1"; changed to "dc1" so the target's node filter matches the cluster.
diff --git a/deployment/infra/conf/cloudinit-database.yml.tpl b/deployment/infra/conf/cloudinit-database.yml.tpl new file mode 100644 index 000000000..d16133a80 --- /dev/null +++ b/deployment/infra/conf/cloudinit-database.yml.tpl @@ -0,0 +1,94 @@ +#cloud-config +# sync with cloudinit-*.yml.tpl +runcmd: + - sed -i -e '/^\(#\|\)PasswordAuthentication/s/^.*$/PasswordAuthentication no/' /etc/ssh/sshd_config + - mkdir -p /opt/nomad/volumes/certs /opt/nomad/volumes/database + - chown -R nomad:nomad /opt/nomad/volumes + - systemctl stop systemd-resolved + - systemctl disable systemd-resolved + - systemctl enable nomad consul dnsmasq + - systemctl start nomad consul dnsmasq + - DD_AGENT_MAJOR_VERSION=7 DD_API_KEY=${datadog_api_key} DD_SITE="datadoghq.com" bash -c "$(curl -L https://s3.amazonaws.com/dd-agent/scripts/install_script.sh)" + - usermod -a -G docker dd-agent + - "echo \"dogstatsd_non_local_traffic: true\napm_config:\n apm_non_local_traffic: true\" >> /etc/datadog-agent/datadog.yaml" + - systemctl restart datadog-agent +apt: + sources: + hashicorp: + source: "deb [arch=amd64] https://apt.releases.hashicorp.com $RELEASE main" + keyid: E8A032E094D8EB4EA189D270DA418C88A3219F7B +packages: + - apt-transport-https + - nomad + - consul + - dnsmasq + - mariadb-client + - jq +write_files: + - path: /etc/dnsmasq.conf + content: | + local-service + no-resolv + server=/consul/127.0.0.1#8600 + server=185.12.64.1 + server=185.12.64.2 + address=/brawltime.ninja/10.0.0.2 + cache-size=65536 + - path: /etc/nomad.d/nomad.hcl + content: | + advertise { + http = "{{ GetPrivateInterfaces | include \"address\" \"10.0.0.*\" | attr \"address\" }}" + rpc = "{{ GetPrivateInterfaces | include \"address\" \"10.0.0.*\" | attr \"address\" }}" + serf = "{{ GetPrivateInterfaces | include \"address\" \"10.0.0.*\" | attr \"address\" }}" + } + datacenter = "dc1" + data_dir = "/opt/nomad" + + server { + enabled = true + bootstrap_expect = 3 + } + + client { + enabled = true + network_interface = "{{ GetPrivateInterfaces | 
include \"address\" \"10.0.0.*\" | attr \"name\" }}" + + host_volume "database" { + path = "/opt/nomad/volumes/database" + read_only = false + } + + reserved { + reserved_ports = "22" + } + + node_class = "${class}" + } + + plugin "docker" { + config { + allow_privileged = true + } + } + + telemetry { + publish_allocation_metrics = true + publish_node_metrics = true + datadog_address = "localhost:8125" + disable_hostname = true + collection_interval = "10s" + } + - path: /etc/consul.d/consul.hcl + content: | + advertise_addr = "{{ GetPrivateInterfaces | include \"address\" \"10.0.0.*\" | attr \"address\" }}" + client_addr = "0.0.0.0" + datacenter = "dc1" + data_dir = "/opt/consul" + + ui_config { + enabled = true + } + + server = true + bootstrap_expect = 3 + retry_join = ["10.0.0.2"] diff --git a/deployment/infra/conf/cloudinit-ingress.yml.tpl b/deployment/infra/conf/cloudinit-ingress.yml.tpl index 11c0879ca..35e31d57c 100644 --- a/deployment/infra/conf/cloudinit-ingress.yml.tpl +++ b/deployment/infra/conf/cloudinit-ingress.yml.tpl @@ -1,5 +1,5 @@ #cloud-config -# sync with cloudinit-worker.yml.tpl +# sync with cloudinit-*.yml.tpl runcmd: - sed -i -e '/^\(#\|\)PasswordAuthentication/s/^.*$/PasswordAuthentication no/' /etc/ssh/sshd_config - mkdir -p /opt/nomad/volumes/certs @@ -46,7 +46,7 @@ write_files: server { enabled = true - bootstrap_expect = 1 + bootstrap_expect = 3 } client { @@ -90,5 +90,4 @@ write_files: } server = true - bootstrap_expect = 1 - EOF + bootstrap_expect = 3 diff --git a/deployment/infra/conf/cloudinit-worker.yml.tpl b/deployment/infra/conf/cloudinit-worker.yml.tpl index d71635564..cb717a4be 100644 --- a/deployment/infra/conf/cloudinit-worker.yml.tpl +++ b/deployment/infra/conf/cloudinit-worker.yml.tpl @@ -1,5 +1,5 @@ #cloud-config -# sync with cloudinit-ingress.yml.tpl +# sync with cloudinit-*.yml.tpl runcmd: - sed -i -e '/^\(#\|\)PasswordAuthentication/s/^.*$/PasswordAuthentication no/' /etc/ssh/sshd_config - mkdir -p 
/opt/nomad/volumes/certs @@ -44,11 +44,6 @@ write_files: enabled = true network_interface = "{{ GetPrivateInterfaces | include \"address\" \"10.0.0.*\" | attr \"name\" }}" - host_volume "certs" { - path = "/opt/nomad/volumes/certs" - read_only = false - } - reserved { reserved_ports = "22" } diff --git a/deployment/infra/hetzner.tf b/deployment/infra/hetzner.tf index 6f93dc783..ee437fca3 100644 --- a/deployment/infra/hetzner.tf +++ b/deployment/infra/hetzner.tf @@ -106,21 +106,25 @@ variable "servers" { class = "ingress" } colt = { - server_type = "cpx11" - class = "worker" + server_type = "cx21" + class = "database" } dynamike = { server_type = "cpx31" class = "database" } + /* edgar = { server_type = "cpx11" class = "worker" } + */ + /* frank = { server_type = "cpx11" class = "worker" } + */ /* gene = { server_type = "cpx11" @@ -167,7 +171,7 @@ resource "hcloud_server" "default" { server_type = each.value.server_type keep_disk = true ssh_keys = [hcloud_ssh_key.default.id] - user_data = templatefile(each.value.class == "ingress" ? 
"${path.module}/conf/cloudinit-ingress.yml.tpl" : "${path.module}/conf/cloudinit-worker.yml.tpl", { + user_data = templatefile("${path.module}/conf/cloudinit-${each.value.class}.yml.tpl", { class = each.value.class, datadog_api_key = var.datadog_api_key, }) diff --git a/deployment/infra/outputs.tf b/deployment/infra/outputs.tf index f1fb94618..4c11ac6a3 100644 --- a/deployment/infra/outputs.tf +++ b/deployment/infra/outputs.tf @@ -11,3 +11,7 @@ output "ingress_ssh_key" { output "public_ip4" { value = [for server in hcloud_server.default : server.ipv4_address] } + +output "network_id" { + value = hcloud_network.default.id +} diff --git a/deployment/jobs/autoscaler.nomad b/deployment/jobs/autoscaler.nomad index 73899b6c5..0d7befed8 100644 --- a/deployment/jobs/autoscaler.nomad +++ b/deployment/jobs/autoscaler.nomad @@ -1,6 +1,7 @@ variable "hcloud_token" {} variable "ssh_public_key_name" {} variable "datadog_api_key" {} +variable "brawltime_net_id" {} job "autoscaler" { datacenters = ["dc1"] @@ -29,12 +30,12 @@ job "autoscaler" { args = [ "-c", - "mkdir -p ${NOMAD_TASK_DIR}/plugins && cp ${NOMAD_TASK_DIR}/nomad-hcloud-autoscaler ${NOMAD_TASK_DIR}/plugins/hcloud-server && chmod +x ${NOMAD_TASK_DIR}/plugins/hcloud-server && nomad-autoscaler agent -config ${NOMAD_TASK_DIR}/config.hcl -plugin-dir ${NOMAD_TASK_DIR}/plugins -policy-dir ${NOMAD_TASK_DIR}/policies -http-bind-address 0.0.0.0 -http-bind-port ${NOMAD_PORT_http}", + "mkdir -p ${NOMAD_TASK_DIR}/plugins && cp ${NOMAD_TASK_DIR}/hcloud-server ${NOMAD_TASK_DIR}/plugins/hcloud-server && chmod +x ${NOMAD_TASK_DIR}/plugins/hcloud-server && nomad-autoscaler agent -config ${NOMAD_TASK_DIR}/config.hcl -plugin-dir ${NOMAD_TASK_DIR}/plugins -policy-dir ${NOMAD_TASK_DIR}/policies -http-bind-address 0.0.0.0 -http-bind-port ${NOMAD_PORT_http}", ] } artifact { - source = "https://github.com/AndrewChubatiuk/nomad-hcloud-autoscaler/releases/download/v0.0.2/nomad-hcloud-autoscaler" + source = 
"https://github.com/AndrewChubatiuk/nomad-hcloud-autoscaler/releases/download/v0.1.1/hcloud-server" } template { @@ -73,6 +74,8 @@ job "autoscaler" { # reduce load on Nomad and slow down scaling default_evaluation_interval = "1m" } + + log_level = "INFO" EOF destination = "${NOMAD_TASK_DIR}/config.hcl" @@ -96,12 +99,13 @@ job "autoscaler" { evaluation_interval = "10m" # TODO checks also takes database/ingress allocations into account + # TODO target-value assumes constant CPU/RAM per unit check "node-cpu" { source = "nomad-apm" query = "percentage-allocated_cpu" strategy "target-value" { - target = 80 + target = 90 } } @@ -110,20 +114,19 @@ job "autoscaler" { query = "percentage-allocated_memory" strategy "target-value" { - target = 80 + target = 90 } } # sync with hetzner.tf target "hcloud-server" { - # combined filters are only supported since Nov 2021 https://github.com/hashicorp/nomad-autoscaler/pull/535 - # the plugin was built Feb 2021 - #datacenter = "dc01" + datacenter = "dc1" node_class = "worker" node_purge = "true" - dry-run = "true" + #dry-run = "true" hcloud_location = "nbg1" hcloud_image = "docker-ce" + hcloud_group_id = "autoscale" hcloud_user_data = <<-EOOF ${ regex_replace( @@ -134,10 +137,11 @@ job "autoscaler" { } EOOF hcloud_ssh_keys = "${var.ssh_public_key_name}" + # scaling-intensive services (web, cube, render) have a 1:2 RAM:CPU ratio, so pick cpx11 over cx21 hcloud_server_type = "cpx11" - hcloud_name_prefix = "brawltime" hcloud_labels = "firewall=true,nomad_class=worker" - hcloud_networks = "brawltime-net" + # id of hetzner network brawltime-net + hcloud_networks = "${var.brawltime_net_id}" } } } @@ -150,8 +154,9 @@ job "autoscaler" { } resources { - cpu = 50 - memory = 128 + cpu = 64 + memory = 64 + memory_max = 128 } service { diff --git a/deployment/jobs/brawltime-clicker.nomad b/deployment/jobs/brawltime-clicker.nomad index 31b160759..a5aa8ac78 100644 --- a/deployment/jobs/brawltime-clicker.nomad +++ 
b/deployment/jobs/brawltime-clicker.nomad @@ -36,6 +36,7 @@ job "brawltime-clicker" { source = "nomad-apm" group = "cpu-allocated" query = "avg_cpu-allocated" + query_window = "10m" strategy "threshold" { upper_bound = 100 @@ -48,6 +49,7 @@ job "brawltime-clicker" { source = "nomad-apm" group = "cpu-allocated" query = "avg_cpu-allocated" + query_window = "10m" strategy "threshold" { upper_bound = 20 @@ -101,9 +103,9 @@ job "brawltime-clicker" { } resources { - cpu = 128 + cpu = 32 memory = 64 - memory_max = 96 + memory_max = 128 } } } diff --git a/deployment/jobs/brawltime-cube.nomad b/deployment/jobs/brawltime-cube.nomad index dedb82421..0c166eb94 100644 --- a/deployment/jobs/brawltime-cube.nomad +++ b/deployment/jobs/brawltime-cube.nomad @@ -8,9 +8,8 @@ variable "domain" { job "brawltime-cube" { datacenters = ["dc1"] - affinity { + constraint { attribute = "${node.class}" - operator = "regexp" value = "worker" } @@ -36,6 +35,7 @@ job "brawltime-cube" { source = "nomad-apm" group = "cpu-allocated-cube" query = "avg_cpu-allocated" + query_window = "10m" strategy "threshold" { upper_bound = 100 @@ -49,6 +49,7 @@ job "brawltime-cube" { source = "nomad-apm" group = "cpu-allocated-cube" query = "avg_cpu-allocated" + query_window = "10m" strategy "threshold" { upper_bound = 20 @@ -121,7 +122,7 @@ job "brawltime-cube" { } group "cube_refresh" { - count = 2 + count = 1 scaling { enabled = true @@ -133,6 +134,7 @@ job "brawltime-cube" { source = "nomad-apm" group = "cpu-allocated-refresh" query = "avg_cpu-allocated" + query_window = "10m" strategy "threshold" { upper_bound = 100 @@ -146,6 +148,7 @@ job "brawltime-cube" { source = "nomad-apm" group = "cpu-allocated-refresh" query = "avg_cpu-allocated" + query_window = "10m" strategy "threshold" { upper_bound = 20 @@ -179,7 +182,7 @@ job "brawltime-cube" { } resources { - cpu = 64 + cpu = 32 memory = 128 memory_max = 256 } diff --git a/deployment/jobs/brawltime-manager.nomad b/deployment/jobs/brawltime-manager.nomad index 
3f61c98d0..8641bb233 100644 --- a/deployment/jobs/brawltime-manager.nomad +++ b/deployment/jobs/brawltime-manager.nomad @@ -11,10 +11,9 @@ variable "domain" { job "brawltime-manager" { datacenters = ["dc1"] - affinity { + constraint { attribute = "${node.class}" - operator = "regexp" - value = "worker" + value = "database" } update { @@ -39,6 +38,7 @@ job "brawltime-manager" { source = "nomad-apm" group = "cpu-allocated" query = "avg_cpu-allocated" + query_window = "10m" strategy "threshold" { upper_bound = 100 @@ -52,6 +52,7 @@ job "brawltime-manager" { source = "nomad-apm" group = "cpu-allocated" query = "avg_cpu-allocated" + query_window = "10m" strategy "threshold" { upper_bound = 20 diff --git a/deployment/jobs/brawltime-media.nomad b/deployment/jobs/brawltime-media.nomad index a4cff05a3..3bf2b25df 100644 --- a/deployment/jobs/brawltime-media.nomad +++ b/deployment/jobs/brawltime-media.nomad @@ -17,11 +17,12 @@ locals { job "brawltime-media" { datacenters = ["dc1"] +/* affinity { attribute = "${node.class}" - operator = "regexp" value = "worker" } +*/ # workaround for limited support for scaling jobs with single writer nodes # https://github.com/hashicorp/nomad/issues/10157 @@ -52,6 +53,7 @@ job "brawltime-media" { source = "nomad-apm" group = "cpu-allocated" query = "avg_cpu-allocated" + query_window = "10m" strategy "threshold" { upper_bound = 100 @@ -65,6 +67,7 @@ job "brawltime-media" { source = "nomad-apm" group = "cpu-allocated" query = "avg_cpu-allocated" + query_window = "10m" strategy "threshold" { upper_bound = 20 @@ -132,9 +135,9 @@ job "brawltime-media" { } resources { - cpu = 96 - memory = 196 - memory_max = 512 + cpu = 64 + memory = 384 + memory_max = 1024 } } @@ -195,9 +198,7 @@ job "brawltime-media" { } config { - # TODO when upgrading, solve https://bbs.archlinux.org/viewtopic.php?id=270005 - # key doesn't work with some clients - image = "atmoz/sftp:alpine-3.7" + image = "atmoz/sftp:alpine" args = ["brawlbot::${local.asset_uid}"] ports = 
["ssh"] volumes = [ diff --git a/deployment/jobs/brawltime-render.nomad b/deployment/jobs/brawltime-render.nomad index 784fc2cd7..c72dca61e 100644 --- a/deployment/jobs/brawltime-render.nomad +++ b/deployment/jobs/brawltime-render.nomad @@ -36,6 +36,7 @@ job "brawltime-render" { source = "nomad-apm" group = "cpu-allocated" query = "avg_cpu-allocated" + query_window = "10m" strategy "threshold" { upper_bound = 100 @@ -49,6 +50,7 @@ job "brawltime-render" { source = "nomad-apm" group = "cpu-allocated" query = "avg_cpu-allocated" + query_window = "10m" strategy "threshold" { upper_bound = 20 diff --git a/deployment/jobs/brawltime-testing.nomad b/deployment/jobs/brawltime-testing.nomad index 099e2542f..451b4892c 100644 --- a/deployment/jobs/brawltime-testing.nomad +++ b/deployment/jobs/brawltime-testing.nomad @@ -9,9 +9,8 @@ variable "web_traduora_project_id" {} job "brawltime-testing" { datacenters = ["dc1"] - affinity { + constraint { attribute = "${node.class}" - operator = "regexp" value = "worker" } @@ -61,7 +60,7 @@ job "brawltime-testing" { } resources { - cpu = 256 + cpu = 128 memory = 196 memory_max = 512 } diff --git a/deployment/jobs/brawltime-web.nomad b/deployment/jobs/brawltime-web.nomad index 6e8cb7422..373194b51 100644 --- a/deployment/jobs/brawltime-web.nomad +++ b/deployment/jobs/brawltime-web.nomad @@ -10,9 +10,8 @@ variable "domain" { job "brawltime-web" { datacenters = ["dc1"] - affinity { + constraint { attribute = "${node.class}" - operator = "regexp" value = "worker" } @@ -38,6 +37,7 @@ job "brawltime-web" { source = "nomad-apm" group = "cpu-allocated" query = "avg_cpu-allocated" + query_window = "10m" strategy "threshold" { upper_bound = 100 @@ -51,6 +51,7 @@ job "brawltime-web" { source = "nomad-apm" group = "cpu-allocated" query = "avg_cpu-allocated" + query_window = "10m" strategy "threshold" { upper_bound = 20 diff --git a/deployment/jobs/clickhouse.nomad b/deployment/jobs/clickhouse.nomad index f50903c15..b5cd5e8d9 100644 --- 
a/deployment/jobs/clickhouse.nomad +++ b/deployment/jobs/clickhouse.nomad @@ -1,7 +1,7 @@ job "clickhouse" { datacenters = ["dc1"] - affinity { + constraint { attribute = "${node.class}" value = "database" } @@ -58,7 +58,7 @@ job "clickhouse" { } config { - image = "yandex/clickhouse-server:21.12-alpine" + image = "clickhouse/clickhouse-server:22.4-alpine" network_mode = "host" volumes = [ diff --git a/deployment/jobs/cubestore.nomad b/deployment/jobs/cubestore.nomad index 775920230..343da1c65 100644 --- a/deployment/jobs/cubestore.nomad +++ b/deployment/jobs/cubestore.nomad @@ -1,7 +1,7 @@ job "cubestore" { datacenters = ["dc1"] - affinity { + constraint { attribute = "${node.class}" value = "database" } @@ -60,8 +60,8 @@ job "cubestore" { resources { cpu = 64 - memory = 512 - memory_max = 1024 + memory = 384 + memory_max = 512 } } diff --git a/deployment/jobs/mariadb.nomad b/deployment/jobs/mariadb.nomad index a3bdd02b4..3a02a6f30 100644 --- a/deployment/jobs/mariadb.nomad +++ b/deployment/jobs/mariadb.nomad @@ -1,7 +1,7 @@ job "mariadb" { datacenters = ["dc1"] - affinity { + constraint { attribute = "${node.class}" value = "database" } @@ -54,7 +54,7 @@ job "mariadb" { } resources { - cpu = 256 + cpu = 128 memory = 256 memory_max = 512 } diff --git a/deployment/jobs/nginx.nomad b/deployment/jobs/nginx.nomad index 2e956ad6e..33978ee34 100644 --- a/deployment/jobs/nginx.nomad +++ b/deployment/jobs/nginx.nomad @@ -105,8 +105,8 @@ job "nginx" { resources { cpu = 1536 - memory = 192 - memory_max = 512 + memory = 256 + memory_max = 1024 } } } diff --git a/deployment/jobs/redis.nomad b/deployment/jobs/redis.nomad index 262d82423..9613ddb30 100644 --- a/deployment/jobs/redis.nomad +++ b/deployment/jobs/redis.nomad @@ -1,7 +1,7 @@ job "redis" { datacenters = ["dc1"] - affinity { + constraint { attribute = "${node.class}" value = "database" } @@ -65,7 +65,7 @@ job "redis" { } resources { - cpu = 1024 + cpu = 256 memory = 512 memory_max = 1536 } diff --git 
a/deployment/jobs/traduora.nomad b/deployment/jobs/traduora.nomad index 77ccfda4d..00023896b 100644 --- a/deployment/jobs/traduora.nomad +++ b/deployment/jobs/traduora.nomad @@ -7,7 +7,7 @@ job "traduora" { affinity { attribute = "${node.class}" - value = "worker" + value = "database" } group "traduora" { @@ -69,8 +69,9 @@ job "traduora" { } resources { - cpu = 192 - memory = 128 + cpu = 128 + memory = 96 + memory_max = 256 } } } diff --git a/deployment/jobs/traefik.nomad b/deployment/jobs/traefik.nomad index d0e4ccede..abd00cac6 100644 --- a/deployment/jobs/traefik.nomad +++ b/deployment/jobs/traefik.nomad @@ -93,7 +93,7 @@ job "traefik" { } resources { - cpu = 1024 + cpu = 1536 memory = 256 memory_max = 512 }