Skip to content

Fix amazonlinux2 and add more features #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Nov 21, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
1. Creates `N` nodes in `M` subnets
1. Creates Autoscaling Group and ELB to load balance nodes
1. Makes sure nodes can talk to each other and create cluster
1. Make sure new nodes always join the cluster
1. Make sure new nodes attempt to join the cluster at startup
1. Configures `/` vhost queues in High Available (Mirrored) mode with automatic synchronization (`"ha-mode":"all", "ha-sync-mode":"3"`)


Expand Down Expand Up @@ -44,3 +44,8 @@ it will update Autoscaling Group and add `2` nodes more. Dead simple.
Node becomes unresponsive? Autoscaling group and ELB Health Checks will automatically replace it with a new one, without data loss.

Note: The VPC must have `enableDnsHostnames` = `true` and `enableDnsSupport` = `true` for the private DNS names to be resolvable for the nodes to connect to each other.


## Debugging
If you can SSH onto one of the nodes you can run:
`docker exec rabbitmq rabbitmqctl cluster_status` to see the cluster status of that node.
128 changes: 119 additions & 9 deletions cloud-init.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,110 @@ write_files:
[ { rabbit, [
{ loopback_users, [ ] } ] }
].
- path: /etc/sysconfig/docker
content: |
# The max number of open files for the daemon itself, and all
# running containers. The default value of 1048576 mirrors the value
# used by the systemd service unit.
DAEMON_MAXFILES=1048576

# Additional startup options for the Docker daemon, for example:
# OPTIONS="--ip-forward=true --iptables=true"
# By default we limit the number of open files per container
OPTIONS="--default-ulimit nofile=1000000:1048576"

# How many seconds the sysvinit script waits for the pidfile to appear
# when starting the daemon.
DAEMON_PIDFILE_TIMEOUT=10
- path: /opt/aws/amazon-cloudwatch-agent/bin/config.json
content: |
{
"agent": {
"metrics_collection_interval": 30,
"run_as_user": "cwagent"
},
"logs": {
"logs_collected": {
"files": {
"collect_list": [
{
"file_path": "/var/log/cloud-init.log",
"log_group_name": "cloud-init.log",
"log_stream_name": "{instance_id}/cloud-init.log"
},
{
"file_path": "cloud-init-output.log",
"log_group_name": "cloud-init-output.log",
"log_stream_name": "{instance_id}/cloud-init-output.log"
}
]
}
}
},
"metrics": {
"append_dimensions": {
"AutoScalingGroupName": "$${aws:AutoScalingGroupName}",
"ImageId": "$${aws:ImageId}",
"InstanceId": "$${aws:InstanceId}",
"InstanceType": "$${aws:InstanceType}"
},
"metrics_collected": {
"cpu": {
"measurement": [
"cpu_usage_idle",
"cpu_usage_iowait",
"cpu_usage_user",
"cpu_usage_system"
],
"metrics_collection_interval": 30,
"totalcpu": false
},
"disk": {
"measurement": [
"used_percent",
"inodes_free"
],
"metrics_collection_interval": 30,
"resources": [
"*"
]
},
"diskio": {
"measurement": [
"io_time",
"write_bytes",
"read_bytes",
"writes",
"reads"
],
"metrics_collection_interval": 30,
"resources": [
"*"
]
},
"mem": {
"measurement": [
"mem_used_percent"
],
"metrics_collection_interval": 30
},
"netstat": {
"measurement": [
"tcp_established",
"tcp_time_wait"
],
"metrics_collection_interval": 30
},
"swap": {
"measurement": [
"swap_used_percent"
],
"metrics_collection_interval": 30
}
}
}
}

- path: /root/find_hosts.sh
content: |
#!/usr/bin/env bash
Expand All @@ -16,15 +120,14 @@ write_files:

DNSES=$(aws ec2 describe-instances --filters "Name=tag:aws:autoscaling:groupName,Values=${asg_name}" "Name=instance-state-name,Values=running" | jq ".Reservations[].Instances[].PrivateDnsName" | xargs)

HOSTNAMES=()
for dns in $DNSES; do
hostname=($${dns//./ })
if [ "$hostname" != "$HOSTNAME" ]; then
HOSTNAMES+=( $hostname )
fi
# pulling out just the first part of the name, eg: ip-10-2-1-82.ec2.internal -> ip-10-2-1-82
dns_subdomain=($${dns//./ })
if [ "$dns" != "$HOSTNAME" ] && [ "$dns_subdomain" != "$HOSTNAME" ] ; then
echo $dns_subdomain
fi
done

echo $HOSTNAMES
- path: /root/bin/join_cluster.sh
content: |
#!/usr/bin/env sh
Expand All @@ -33,19 +136,22 @@ write_files:

for run in {1..3}; do
sleep $[ ( $RANDOM % 10 ) + 1 ]s
echo "stopping rabbit to try and join other nodes"
rabbitmqctl stop_app

NEW_HOSTNAMES=()
for hostname in $HOSTNAMES; do
rabbitmqctl join_cluster rabbit@$hostname
for peerhostname in $HOSTNAMES; do
echo "trying to join $${peerhostname}"
rabbitmqctl join_cluster rabbit@$peerhostname
st=$?
if [ $st -ne 0 ] && [ $st -ne 130 ]; then # 130 is "already joined"
NEW_HOSTNAMES+=( $hostname )
NEW_HOSTNAMES+=( $peerhostname )
fi
done

HOSTNAMES=( $${NEW_HOSTNAMES[@]} )
rabbitmqctl start_app
echo "startting rabbit after trying to join other nodes"

if [ $${#HOSTNAMES[@]} -eq 0 ]; then
exit 0
Expand All @@ -66,12 +172,16 @@ write_files:
runcmd:
- yum update -y
- yum install -y docker jq
- wget https://s3.amazonaws.com/amazoncloudwatch-agent/amazon_linux/amd64/latest/amazon-cloudwatch-agent.rpm
- sudo rpm -U ./amazon-cloudwatch-agent.rpm
- sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/bin/config.json -s
- service docker start
- chkconfig docker on
- usermod -a -G docker ec2-user
- $(aws ecr get-login --no-include-email --region ${region} --registry-ids ${ecr_registry_id})
- docker run -d --name rabbitmq --hostname $HOSTNAME --log-driver=awslogs --log-opt awslogs-region=${region} --log-opt awslogs-group=${cw_log_group} -p 4369:4369 -p 5672:5672 -p 15672:15672 -p 25672:25672 -e RABBITMQ_ERLANG_COOKIE='${secret_cookie}' -v /root/data:/var/lib/rabbitmq -v /root/conf/:/etc/rabbitmq -v /root/bin:/tmp/bin ${rabbitmq_image}
- sleep 1
- bash /root/find_hosts.sh
- docker exec rabbitmq bash /tmp/bin/join_cluster.sh $(bash /root/find_hosts.sh)
- sleep 1
- bash /root/configure.sh
24 changes: 21 additions & 3 deletions main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ data "template_file" "cloud-init" {
rabbitmq_image = var.rabbitmq_image
ecr_registry_id = var.ecr_registry_id
cw_log_group = aws_cloudwatch_log_group.log_group.name
cw_log_stream = local.cluster_name
}
}

Expand Down Expand Up @@ -143,7 +144,8 @@ resource "aws_security_group" "rabbitmq_elb" {
}

tags = {
Name = "rabbitmq ${var.name} ELB"
Name = "rabbitmq ${var.name} ELB"
Terraform = true
}
}

Expand All @@ -167,6 +169,7 @@ resource "aws_security_group" "rabbitmq_nodes" {
}

ingress {
description = "management port"
protocol = "tcp"
from_port = 15672
to_port = 15672
Expand All @@ -184,7 +187,8 @@ resource "aws_security_group" "rabbitmq_nodes" {
}

tags = {
Name = "rabbitmq ${var.name} nodes"
Name = "rabbitmq ${var.name} nodes"
Terraform = true
}
}

Expand Down Expand Up @@ -232,6 +236,12 @@ resource "aws_autoscaling_group" "rabbitmq" {
value = "enabled"
propagate_at_launch = true
}

tag {
key = "Terraform"
value = true
propagate_at_launch = true
}
}

resource "aws_elb" "elb" {
Expand Down Expand Up @@ -264,8 +274,16 @@ resource "aws_elb" "elb" {
internal = true
security_groups = flatten([aws_security_group.rabbitmq_elb.id, var.elb_additional_security_group_ids])

access_logs {
bucket = var.access_log_bucket
bucket_prefix = var.access_log_bucket_prefix
interval = var.access_log_interval
enabled = var.access_logs_enabled
}

tags = {
Name = local.cluster_name
Name = local.cluster_name
Terraform = true
}
}

18 changes: 18 additions & 0 deletions variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,22 @@ variable "ecr_registry_id" {
variable "log_retention_in_days" {
type = string
default = 365
}
variable "access_log_bucket" {
type = string
default = "bucketname"
description = "optional bucket name to use for access logs"
}
variable "access_log_bucket_prefix" {
type = string
default = ""
description = "optional prefix to use for access logs"
}
variable "access_log_interval" {
type = string
default = 60
}
variable "access_logs_enabled" {
type = bool
default = false
}