Commit
MMSYS-1722 refactoring, documentation extended, license and maintainers files added
charlie4gfk committed Dec 14, 2020
1 parent fd7beda commit 6db8135
Showing 19 changed files with 217 additions and 139 deletions.
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 GfK SE

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
3 changes: 3 additions & 0 deletions MAINTEINERS
@@ -0,0 +1,3 @@
Ilya Isakov <ilya.isakov@gfk.com>
Can Suvari <can.suvari@gfk.com>
Jakub Berezowski <jakub.berezowski@gfk.com>
5 changes: 5 additions & 0 deletions acm.tf
@@ -0,0 +1,5 @@
data "aws_acm_certificate" "this" {
domain = var.certificate_domain_name
types = ["AMAZON_ISSUED"]
most_recent = true
}
23 changes: 1 addition & 22 deletions alb.tf
@@ -1,11 +1,3 @@
### SSL Cert ###
data "aws_acm_certificate" "this" {
domain = var.certificate_domain_name
types = ["AMAZON_ISSUED"]
most_recent = true
}

### Loadbalancer ###
# TODO(ilya_isakov): change backend protocol to HTTPS
module "aws_alb" {
source = "terraform-aws-modules/alb/aws"
@@ -67,17 +59,4 @@ module "aws_alb" {
]

tags = var.tags
}

### DNS record ###
resource "aws_route53_record" "this" {
zone_id = var.dns_zone_id
name = var.name
type = "A"

alias {
name = module.aws_alb.this_lb_dns_name
zone_id = module.aws_alb.this_lb_zone_id
evaluate_target_health = true
}
}
}
5 changes: 5 additions & 0 deletions cloudwatch.tf
@@ -0,0 +1,5 @@
resource "aws_cloudwatch_log_group" "ecs_cloudwatch_logs" {
name = "/ecs/${var.name}"
retention_in_days = var.cloudwatch_retention
tags = var.tags
}
30 changes: 30 additions & 0 deletions data_sync_dags.tf
@@ -0,0 +1,30 @@
resource "aws_datasync_location_s3" "dag_source" {
s3_bucket_arn = data.aws_s3_bucket.dags_bucket.arn
subdirectory = "/${var.dag_s3_key}"

s3_config {
bucket_access_role_arn = aws_iam_role.dags-datasync-task-role.arn
}

tags = var.tags
}

resource "aws_datasync_location_efs" "dag_destination" {
efs_file_system_arn = module.efs.arn
subdirectory = "/usr/local/airflow/dags"

ec2_config {
security_group_arns = [aws_security_group.sg_airflow_internal.arn]
subnet_arn = data.aws_subnet.target_mount_subnet.arn
}
}

resource "aws_datasync_task" "dag_sync" {
destination_location_arn = aws_datasync_location_efs.dag_destination.arn
name = "${var.name}-dags-delivery"
source_location_arn = aws_datasync_location_s3.dag_source.arn

options {
bytes_per_second = -1
}
}
38 changes: 38 additions & 0 deletions data_sync_generic.tf
@@ -0,0 +1,38 @@

data "aws_iam_policy_document" "datasync-role-policy" {
statement {
actions = ["sts:AssumeRole"]

principals {
type = "Service"
identifiers = ["datasync.amazonaws.com"]
}
}
}

resource "aws_iam_role" "dags-datasync-task-role" {
name = "DagsDatasyncTaskRole"
path = "/airflow_module/"
assume_role_policy = data.aws_iam_policy_document.datasync-role-policy.json
}

resource "aws_iam_policy" "dags-datasync-task-policy" {
name = "dags-datasync-task-policy"
path = "/airflow_module/"
description = "Policy allowing Datasync to copy DAGs from s3 to EFS"

policy = file("${path.module}/templates/datasync_policy.json")
}

resource "aws_iam_role_policy_attachment" "dags-datasync-task" {
role = aws_iam_role.dags-datasync-task-role.name
policy_arn = aws_iam_policy.dags-datasync-task-policy.arn
}

data "aws_s3_bucket" "dags_bucket" {
bucket = var.dag_s3_bucket
}

data "aws_subnet" "target_mount_subnet" {
id = var.private_subnet_ids[0]
}
31 changes: 31 additions & 0 deletions data_sync_requirements.tf
@@ -0,0 +1,31 @@
# TODO (ilya_isakov): make the custom `requirements.txt` file optional (currently, when no file is provided it creates a folder in the container); move related resources to a separate .tf file
resource "aws_datasync_location_s3" "requirements_source" {
s3_bucket_arn = data.aws_s3_bucket.dags_bucket.arn
subdirectory = "/${var.requirements_s3_key}"

s3_config {
bucket_access_role_arn = aws_iam_role.dags-datasync-task-role.arn
}

tags = var.tags
}

resource "aws_datasync_location_efs" "requirements_destination" {
efs_file_system_arn = module.efs.arn
subdirectory = "/requirements"

ec2_config {
security_group_arns = [aws_security_group.sg_airflow_internal.arn]
subnet_arn = data.aws_subnet.target_mount_subnet.arn
}
}

resource "aws_datasync_task" "requirements_sync" {
destination_location_arn = aws_datasync_location_efs.requirements_destination.arn
name = "${var.name}-requirements-delivery"
source_location_arn = aws_datasync_location_s3.requirements_source.arn

options {
bytes_per_second = -1
}
}
3 changes: 3 additions & 0 deletions docs/dags.md
@@ -0,0 +1,3 @@
#### DAGs delivery
The module deploys a [Datasync task](../data_sync_dags.tf), which copies DAGs from the
S3 bucket to EFS. It can be triggered manually or from the DAGs delivery pipeline.
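
Besides manual or pipeline triggers, the task could also be run on a schedule. A
minimal sketch, assuming your AWS provider version supports the `schedule` block on
`aws_datasync_task` (the rate expression is only an example):
```hcl
resource "aws_datasync_task" "dag_sync" {
  name                     = "${var.name}-dags-delivery"
  source_location_arn      = aws_datasync_location_s3.dag_source.arn
  destination_location_arn = aws_datasync_location_efs.dag_destination.arn

  # Hypothetical schedule: sync DAGs from S3 to EFS once an hour.
  schedule {
    schedule_expression = "rate(1 hour)"
  }
}
```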
6 changes: 6 additions & 0 deletions docs/index.md
@@ -0,0 +1,6 @@
Table of contents:
1. [DAGs](./dags.md)
1. [Logging](./logging.md)
1. [RBAC](rbac.md)
1. [Setup recommendations](./setup_recommendations.md)

19 changes: 19 additions & 0 deletions docs/logging.md
@@ -0,0 +1,19 @@
#### Logging level
If you want to see more logs from the Airflow webserver in CloudWatch, use the
`airflow_core_logging_level` variable.
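
A minimal sketch of a module call that raises the log level; the source path and
module name are assumptions:
```hcl
module "airflow" {
  # Source path is an assumption; point it at wherever this module lives.
  source = "./modules/airflow-ecs"

  name = "airflow"

  # Emit more detailed Airflow logs to CloudWatch.
  airflow_core_logging_level = "DEBUG"

  # ... other required module variables omitted for brevity.
}
```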

#### Persisting logs
There are multiple log types produced by the module. First, the containers write
logs to standard output, and those logs are delivered to the [CloudWatch log
group](../cloudwatch.tf). To avoid cluttering standard output, we decided not to send
everything there: logs from `$AIRFLOW_HOME/logs` are instead written to a volume
backed by EFS. The idea is that this volume could also be mounted into a container
running a log-management agent, which would ship the logs to a centralized solution
such as ELK or Datadog.

In the picture below you can see that logs from the worker and webserver containers
are delivered to the same location on EFS. This is done so that the webserver can
retrieve worker (DAG) logs without calling the worker API, which proved to be more
reliable in practice.

![Airflow components schema](./module_architecture.png)
Binary file modified docs/module_architecture.png
23 changes: 23 additions & 0 deletions docs/rbac.md
@@ -0,0 +1,23 @@
#### RBAC Authentication
If you enabled RBAC with `var.airflow_webserver_rbac`, run the following command in
the webserver container to create the dummy admin user:
```bash
airflow create_user -r Admin -u admin -e admin@example.com -f admin -l user -p admin
```
Once this is done, you can create more users in the webserver UI and remove the dummy
one.

#### RBAC with Fargate launch type
With the Fargate launch type it is no longer possible to SSH into an EC2 instance and
attach to the container. You can either run the cluster in EC2 mode first, create the
user with the approach described above, and then switch to Fargate, or use a custom
Docker image and add the following bash script to the `entrypoint.sh` file, right
after `airflow initdb`:
```bash
# Create a default user if no user with the Admin role exists.
if airflow list_users 2>/dev/null | grep -q "Admin" ; then
echo "User with admin role already exists."
else
echo "Creating Airflow Admin User.." && airflow create_user -r Admin -u "admin" -p "admin" -f "Default" -l "User" -e "defaultuser@airflow.com"
fi
```
17 changes: 17 additions & 0 deletions docs/setup_recommendations.md
@@ -0,0 +1,17 @@
#### Education environment (easiest to start)
The default variable values are configured to make getting a cluster up and running
as easy as possible.

#### Development environment
No special recommendations.

#### Production environment
When using the module in a production environment, we recommend taking a closer look
at the values of the following variables (see the sketch after the list):
1. rds_instance_class
2. webserver_task_definition_memory
3. webserver_task_definition_cpu
4. scheduler_task_definition_memory
5. scheduler_task_definition_cpu
6. worker_task_definition_memory
7. worker_task_definition_cpu
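
A minimal sketch of a production-oriented module call; the source path, module name,
and every sizing value below are illustrative assumptions, not recommendations:
```hcl
module "airflow" {
  # Source path and all sizes below are assumptions; tune them for your workload.
  source = "./modules/airflow-ecs"

  name = "airflow-prod"

  rds_instance_class = "db.m5.large"

  webserver_task_definition_cpu    = 1024
  webserver_task_definition_memory = 2048

  scheduler_task_definition_cpu    = 1024
  scheduler_task_definition_memory = 2048

  worker_task_definition_cpu    = 2048
  worker_task_definition_memory = 4096

  # ... other required module variables omitted for brevity.
}
```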
8 changes: 1 addition & 7 deletions ecs.tf
@@ -125,7 +125,7 @@ resource "aws_ecs_task_definition" "webserver" {
}

resource "aws_ecs_task_definition" "scheduler" {
family = "${var.name}-scheduler" # var.task_definition_family
family = "${var.name}-scheduler"
container_definitions = data.template_file.scheduler.rendered
memory = var.scheduler_task_definition_memory
cpu = var.scheduler_task_definition_cpu
@@ -297,11 +297,5 @@ data "template_file" "worker" {
airflow_smtp_password = var.airflow_smtp_password
airflow_smtp_mail_from = var.airflow_smtp_mail_from
airflow_docker_image = var.airflow_image

# airflow_home = var.airflow_home
# airflow_webserver_rbac = var.airflow_webserver_rbac
# airflow_core_dag_concurrency = var.airflow_core_dag_concurrency
# airflow_webserver_dag_orientation = var.airflow_webserver_dag_orientation
# airflow_scheduler_dag_dir_list_interval = var.airflow_scheduler_dag_dir_list_interval
}
}
9 changes: 1 addition & 8 deletions asg.tf → ecs_container_instences.tf
@@ -62,11 +62,4 @@ resource "aws_autoscaling_group" "ecs" {
propagate_at_launch = true
}
}

}

resource "aws_cloudwatch_log_group" "ecs_cloudwatch_logs" {
name = "/ecs/${var.name}"
retention_in_days = var.cloudwatch_retention
tags = var.tags
}
}
102 changes: 1 addition & 101 deletions efs.tf
@@ -10,104 +10,4 @@ module "efs" {
encrypted = true

tags = var.tags
}

data "aws_iam_policy_document" "datasync-role-policy" {
statement {
actions = ["sts:AssumeRole"]

principals {
type = "Service"
identifiers = ["datasync.amazonaws.com"]
}
}
}

resource "aws_iam_role" "dags-datasync-task-role" {
name = "DagsDatasyncTaskRole"
path = "/airflow_module/"
assume_role_policy = data.aws_iam_policy_document.datasync-role-policy.json
}

resource "aws_iam_policy" "dags-datasync-task-policy" {
name = "dags-datasync-task-policy"
path = "/airflow_module/"
description = "Policy allowing Datasync to copy DAGs from s3 to EFS"

policy = file("${path.module}/templates/datasync_policy.json")
}

resource "aws_iam_role_policy_attachment" "dags-datasync-task" {
role = aws_iam_role.dags-datasync-task-role.name
policy_arn = aws_iam_policy.dags-datasync-task-policy.arn
}

data "aws_s3_bucket" "dags_bucket" {
bucket = var.dag_s3_bucket
}

data "aws_subnet" "target_mount_subnet" {
id = var.private_subnet_ids[0]
}

resource "aws_datasync_location_s3" "dag_source" {
s3_bucket_arn = data.aws_s3_bucket.dags_bucket.arn
subdirectory = "/${var.dag_s3_key}"

s3_config {
bucket_access_role_arn = aws_iam_role.dags-datasync-task-role.arn
}

tags = var.tags
}

resource "aws_datasync_location_efs" "dag_destination" {
efs_file_system_arn = module.efs.arn
subdirectory = "/usr/local/airflow/dags"

ec2_config {
security_group_arns = [aws_security_group.sg_airflow_internal.arn]
subnet_arn = data.aws_subnet.target_mount_subnet.arn
}
}

resource "aws_datasync_task" "dag_sync" {
destination_location_arn = aws_datasync_location_efs.dag_destination.arn
name = "${var.name}-dags-delivery"
source_location_arn = aws_datasync_location_s3.dag_source.arn

options {
bytes_per_second = -1
}
}

resource "aws_datasync_location_s3" "requirements_source" {
s3_bucket_arn = data.aws_s3_bucket.dags_bucket.arn
subdirectory = "/${var.requirements_s3_key}"

s3_config {
bucket_access_role_arn = aws_iam_role.dags-datasync-task-role.arn
}

tags = var.tags
}

resource "aws_datasync_location_efs" "requirements_destination" {
efs_file_system_arn = module.efs.arn
subdirectory = "/requirements"

ec2_config {
security_group_arns = [aws_security_group.sg_airflow_internal.arn]
subnet_arn = data.aws_subnet.target_mount_subnet.arn
}
}

resource "aws_datasync_task" "requirements_sync" {
destination_location_arn = aws_datasync_location_efs.requirements_destination.arn
name = "${var.name}-requirements-delivery"
source_location_arn = aws_datasync_location_s3.requirements_source.arn

options {
bytes_per_second = -1
}
}
}
11 changes: 11 additions & 0 deletions route53_record.tf
@@ -0,0 +1,11 @@
resource "aws_route53_record" "this" {
zone_id = var.dns_zone_id
name = var.name
type = "A"

alias {
name = module.aws_alb.this_lb_dns_name
zone_id = module.aws_alb.this_lb_zone_id
evaluate_target_health = true
}
}
2 changes: 1 addition & 1 deletion templates/webserver.json
@@ -88,7 +88,7 @@
"sourceVolume": "requirements"
},
{
"readOnly": true,
"readOnly": false,
"containerPath": "/usr/local/airflow/logs",
"sourceVolume": "worker_logs"
}
