Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
88 commits
Select commit Hold shift + click to select a range
d6922ee
Test Segment Data Lake
cyclops23 Feb 27, 2020
b19a544
Merge remote-tracking branch 'upstream/master'
cyclops23 Mar 5, 2020
68feb8a
Remove segment_data_lake Glue database
cyclops23 Mar 5, 2020
9c5cc29
Add core-api source
cyclops23 Mar 9, 2020
0c34133
Update to latest modules
cyclops23 Mar 10, 2020
ce4b7a5
Merge remote-tracking branch 'upstream/master'
cyclops23 Mar 18, 2020
cb099ce
Update IAM module
cyclops23 Mar 18, 2020
8fb6774
Update IAM module
cyclops23 Mar 18, 2020
d57c6dc
Add Slicelink/Braze
cyclops23 Mar 27, 2020
a66563b
Add partner-websites-prod / slice-os-prod sources
cyclops23 Apr 3, 2020
57fe630
adding keys for production segment
Jul 21, 2020
1fb8e19
Merge pull request #1 from slicelife/richardgilmore/ch111860/ingest-a…
cyclops23 Jul 21, 2020
034b0b8
Merge remote-tracking branch 'upstream/master'
cyclops23 Jul 22, 2020
ab6ca9f
Merge pull request #2 from slicelife/update-upstream
cyclops23 Jul 22, 2020
4330cf6
Update main.tf
Jul 23, 2020
7ddd4ee
added identifiers
Jul 23, 2020
6008b58
Update main.tf
Jul 23, 2020
478e33d
Merge pull request #3 from slicelife/dev-sources
cyclops23 Jul 23, 2020
1248470
Update _provider.tf
Aug 25, 2020
7ecb681
Update _provider.tf
Aug 25, 2020
797e587
Update main.tf
Aug 25, 2020
87722a1
Pin aws provider to 2.50
cyclops23 Aug 25, 2020
b111c49
Pin s3 module to 1.9
cyclops23 Aug 25, 2020
3a1bbb6
Merge pull request #4 from slicelife/aws-provider-patch
cyclops23 Aug 25, 2020
5a613cf
Adding braze dev and prod
ronanbradley Sep 10, 2020
3d3a2c3
Merge pull request #8 from slicelife/ch107729/import-braze-data-to-da…
Sep 10, 2020
28966ff
Update Segment EMR version
maria-dobrska Sep 16, 2020
4b44907
Merge latest segmentio
maria-dobrska Sep 16, 2020
d0ad790
iam module fixes after merge from segmentio
maria-dobrska Sep 16, 2020
f127754
Revert "iam module fixes after merge from segmentio"
maria-dobrska Sep 16, 2020
f57d578
Segment IAM role fix?
maria-dobrska Sep 16, 2020
f7bd4ac
Revert "Segment IAM role fix?"
maria-dobrska Sep 16, 2020
3b81a54
Segment IAM role fix maybe this time?
maria-dobrska Sep 16, 2020
f051aee
Segment IAM role fix maybe this time?
maria-dobrska Sep 16, 2020
7a20102
Segment IAM role fix maybe this time?
maria-dobrska Sep 16, 2020
cf2eba7
Segment IAM role fix maybe this time?
maria-dobrska Sep 16, 2020
73d1908
Revert "Segment IAM role fix maybe this time?"
maria-dobrska Sep 16, 2020
7dd6c9c
Apply dev to run from feature branch
maria-dobrska Sep 16, 2020
2df1122
Merge pull request #9 from slicelife/dobrska/ch125447/update-segment-…
maria-dobrska Sep 21, 2020
f529ab8
Fix IAM roles and profiles addresses in dev - test
maria-dobrska Sep 23, 2020
83e22b2
Fix IAM roles and profiles addresses in dev and prod
maria-dobrska Sep 24, 2020
2984db0
Merge pull request #12 from slicelife/dobrska/ch125447/update-segment…
maria-dobrska Sep 24, 2020
7f0f021
add register to production segment cluster
Mar 29, 2021
a124e73
Update main.tf
Mar 29, 2021
5f73184
Update main.tf
Mar 29, 2021
1a19d4a
Merge pull request #13 from slicelife/richardgilmore/ch172132/ingest-…
Mar 29, 2021
a5e7683
drivers app segment source external keys
maria-dobrska Jun 10, 2021
5c320ff
Merge pull request #14 from slicelife/dobrska/ch188909/logistics-tent…
maria-dobrska Jun 10, 2021
431c3ac
add set cluster script
Dec 1, 2021
3385114
add set cluster script
Dec 1, 2021
c5637d2
add cluster
Dec 1, 2021
0ccf8ae
add set cluster script
Dec 1, 2021
363c9ae
add set cluster script
Dec 1, 2021
6bceb0e
add cluster
Dec 1, 2021
b8da29b
add cluster
Dec 1, 2021
f3b910d
add cluster
Dec 2, 2021
dbddce1
Merge pull request #15 from slicelife/set_cluster_script
Dec 2, 2021
4e9fc38
increase production cores
Dec 2, 2021
2c9db06
Merge pull request #16 from slicelife/increase-nodes-in-production
Dec 2, 2021
0224e1f
adding tags for EMR repos
gilandose Dec 20, 2021
c3ed646
add paren
gilandose Dec 20, 2021
c4b1cb4
Merge pull request #17 from slicelife/default-tags
Dec 20, 2021
6be42c3
adding tags for older aws provider
Dec 21, 2021
145ff7c
adding tags for older aws provider
Dec 21, 2021
c5680d9
Merge pull request #18 from slicelife/data-tagging
Dec 21, 2021
4219233
add tags in production
Dec 21, 2021
566220e
Merge pull request #19 from slicelife/prod-tagging
Dec 21, 2021
c4e5897
add paren
gilandose Mar 7, 2022
8b63696
Update main.tf
gilandose Mar 7, 2022
c299d6e
Update main.tf
gilandose Mar 7, 2022
7143927
Merge pull request #20 from slicelife/gilandose-patch-1
gilandose Mar 7, 2022
6f420dd
Update main.tf
gilandose Mar 7, 2022
b2b7843
Update main.tf
gilandose Mar 7, 2022
82ea7c4
Update main.tf
gilandose Mar 7, 2022
342e4d6
update to correct account id
gilandose Apr 12, 2022
f68b489
Merge pull request #22 from slicelife/hotfix/dataisland-account-id
gilandose Apr 12, 2022
36ca1b1
change to use rules
gilandose May 9, 2022
996b0b8
add rule keyword
gilandose May 9, 2022
bf1c5ae
Merge pull request #23 from slicelife/change-ci-to-rules
gilandose May 9, 2022
6132501
Update .gitlab-ci.yml
gilandose May 9, 2022
fab30c5
Update .gitlab-ci.yml
gilandose May 12, 2022
b078e6e
Update .gitlab-ci.yml
gilandose May 12, 2022
d49695b
Update .gitlab-ci.yml
gilandose May 12, 2022
a1d908c
Update .gitlab-ci.yml
gilandose May 12, 2022
e1bef49
fix file location
gilandose Oct 6, 2022
d669ba7
path change in files
gilandose Oct 6, 2022
5251d3d
Update main.tf
gilandose Feb 15, 2023
ad0702c
chore: Switch gp2 EBS volumes to gp3
LanaM1996 May 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,5 @@ override.tf.json
# End of https://www.gitignore.io/api/osx,ruby,terraform,visualstudiocode

.kitchen/

.idea/
91 changes: 91 additions & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Pipeline stages, in execution order: plan first, then apply.
stages: [plan_terraform, apply_terraform]

# Hidden template (YAML anchor) shared by the plan jobs.
# Runs `terraform init` and `terraform plan` inside the directory named by
# TF_ENVIRONMENT and writes the plan to a file called `tfplan`.
.plan_terraform: &plan_terraform
  stage: plan_terraform
  artifacts:
    # Keep the plan file and the initialised .terraform directory for one day.
    expire_in: 1 day
    untracked: true
    paths:
      - ${TF_ENVIRONMENT}/.terraform/**/*
      - ${TF_ENVIRONMENT}/tfplan
  script:
    - cd ${TF_ENVIRONMENT}
    - terraform init -no-color
    - terraform plan -no-color -out=tfplan

# Plans the `development` environment using the shared plan template.
# No `rules` are declared on this job.
plan_terraform_dev:
  <<: *plan_terraform
  image: docker.dev.slicelife.com/gitlab-runner-terraform-12:stable
  tags:
    - development
  variables:
    TF_ENVIRONMENT: development
    IAM_ROLE: $IAM_ROLE_TERRAFORM_DEV
    VAULT_ADDR: $VAULT_ADDR_DEV
    VAULT_TOKEN: $VAULT_TOKEN_DEV

# Plans the `production` environment.
# Only created for commits on the default branch or for triggered pipelines.
plan_terraform_prod:
  <<: *plan_terraform
  image: docker.prod.slicelife.com/gitlab-runner-terraform-12:stable
  tags:
    - production
  variables:
    TF_ENVIRONMENT: production
    IAM_ROLE: $IAM_ROLE_TERRAFORM_PROD
    VAULT_ADDR: $VAULT_ADDR_PROD
    VAULT_TOKEN: $VAULT_TOKEN_PROD
  # Overrides the template script: the segment-setup null_resource is tainted
  # before planning, so every production plan schedules it for re-creation.
  script:
    - cd ${TF_ENVIRONMENT}
    - terraform init -no-color
    - terraform taint module.segment.null_resource.segment-setup
    - terraform plan -no-color -out=tfplan
  rules:
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
    - if: '$CI_PIPELINE_SOURCE == "trigger"'

# Hidden template (YAML anchor) shared by the apply jobs.
# Applies the plan file named `tfplan` found in the TF_ENVIRONMENT directory.
.apply_terraform: &apply_terraform
  stage: apply_terraform
  artifacts:
    paths:
      - ${TF_ENVIRONMENT}/.terraform
  script:
    - cd ${TF_ENVIRONMENT}
    - terraform apply -no-color tfplan

# Applies the `development` plan.
# Runs automatically in triggered pipelines; in every other pipeline it is a
# blocking manual job.
apply_terraform_dev:
  <<: *apply_terraform
  image: docker.dev.slicelife.com/gitlab-runner-terraform-12:stable
  tags:
    - development
  variables:
    TF_ENVIRONMENT: development
    IAM_ROLE: $IAM_ROLE_TERRAFORM_DEV
    VAULT_ADDR: $VAULT_ADDR_DEV
    VAULT_TOKEN: $VAULT_TOKEN_DEV
  rules:
    - if: '$CI_PIPELINE_SOURCE == "trigger"'
    - when: manual
      allow_failure: false

# Applies the `production` plan.
# Triggered pipelines apply automatically once earlier stages succeed;
# commits on the default branch require a manual start.
apply_terraform_prod:
  <<: *apply_terraform
  image: docker.prod.slicelife.com/gitlab-runner-terraform-12:stable
  tags:
    - production
  variables:
    TF_ENVIRONMENT: production
    IAM_ROLE: $IAM_ROLE_TERRAFORM_PROD
    VAULT_ADDR: $VAULT_ADDR_PROD
    VAULT_TOKEN: $VAULT_TOKEN_PROD
  # Same commands as the shared apply template, restated explicitly here.
  script:
    - cd ${TF_ENVIRONMENT}
    - terraform apply -no-color tfplan
  rules:
    - if: '$CI_PIPELINE_SOURCE == "trigger"'
      when: on_success
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
      when: manual
3 changes: 0 additions & 3 deletions CHANGELOG.md

This file was deleted.

89 changes: 77 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,35 @@
# terraform-aws-data-lake
# terraform-aws-data-lake (forked)

Forked version of the Segment repo that adds Slice CI/CD.

Updating from the upstream Segment repo to pull in latest changes:
```bash
git remote add upstream git@github.com:segmentio/terraform-aws-data-lake.git
git merge upstream/master
```

Add the sources in Segment: https://app.segment.com/mypizza-zach/destinations/catalog/data-lakes

| Param | Value |
| ------------ |:--------------|
| Region: | us-east-1
| Cluster ID: | j-1T2VIUV1YG249 (ID of the EMR cluster)
| Glue Catalog | 651565136086
| Database Name: | segment_data_lake
| IAM Role: | arn:aws:iam::651565136086:role/segment-data-lake-iam-role
| S3 Bucket: | 651565136086-slice-segment-data-lake

Note: **If the Terraform apply recreates the EMR cluster then all Segment destinations will need to be updated with the new Cluster ID!**

_Note: Data Lakes is currently in Limited Availability._

# terraform-aws-data-lake

Terraform modules which create AWS resources for a Segment Data Lake.

# Prerequisites

* Accept the [Data Lakes Terms of Service](https://app.segment.com/{workspace_slug}/destinations/catalog?category=DataLakes) (replace the `{workspace_slug}` with your workspace slug).
* Authorized [AWS account](https://aws.amazon.com/account/).
* Ability to run Terraform with your AWS Account. You must use Terraform 0.11 or higher.
* Ability to run Terraform with your AWS Account. Terraform 0.11 and newer are supported.
* A subnet within a VPC for the EMR cluster to run in.
* An [S3 Bucket](https://github.com/terraform-aws-modules/terraform-aws-s3-bucket) for Segment to load data into. You can create a new one just for this, or re-use an existing one you already have.

Expand Down Expand Up @@ -57,7 +78,11 @@ mkdir segment-datalakes-tf

```hcl
provider "aws" {
region = "us-west-2" # Replace this with the AWS region your infrastructure is set up in.
# Replace this with the AWS region your infrastructure is set up in.
region = "us-west-2"

# Currently our modules require the older v2 AWS provider, as upgrading to v3 has notable breaking changes.
version = "~> 2"
}

locals {
Expand All @@ -71,26 +96,45 @@ locals {

# This is the target where Segment will write your data.
# You can skip this if you already have an S3 bucket and just reference that name manually later.
# If you decide to skip this and use an existing bucket, ensure that you attach a 14 day expiration lifecycle policy to
# your S3 bucket for the "segment-stage/" prefix.
resource "aws_s3_bucket" "segment_datalake_s3" {
name = "my-first-segment-datalake"
bucket = "my-first-segment-datalake"

lifecycle_rule {
enabled = true

prefix = "segment-stage/"

expiration {
days = 14
}

abort_incomplete_multipart_upload_days = 7
}
}

# Creates the IAM Policy that allows Segment to access the necessary resources
# in your AWS account for loading your data.
module "iam" {
source = "git@github.com:segmentio/terraform-aws-data-lake//modules/iam?ref=v0.2.0"
source = "git@github.com:segmentio/terraform-aws-data-lake//modules/iam?ref=v0.4.0"

# Suffix is not strictly required if only initializing this module once.
# However, if you need to initialize multiple times across different Terraform
# workspaces, this hook allows the generated IAM policies to be given unique
# names.
suffix = "-prod"

name = "segment-data-lake-iam-role"
s3_bucket = "${aws_s3_bucket.segment_datalake_s3.name}"
s3_bucket = "${aws_s3_bucket.segment_datalake_s3.id}"
external_ids = "${values(local.segment_sources)}"
}

# Creates an EMR Cluster that Segment uses for performing the final ETL on your
# data that lands in S3.
module "emr" {
source = "git@github.com:segmentio/terraform-aws-data-lake//modules/emr?ref=v0.2.0"
source = "git@github.com:segmentio/terraform-aws-data-lake//modules/emr?ref=v0.4.0"

s3_bucket = "${aws_s3_bucket.segment_datalake_s3.name}"
s3_bucket = "${aws_s3_bucket.segment_datalake_s3.id}"
subnet_id = "subnet-XXX" # Replace this with the subnet ID you want the EMR cluster to run in.

# LEAVE THIS AS-IS
Expand All @@ -99,6 +143,7 @@ module "emr" {
iam_emr_instance_profile = "${module.iam.iam_emr_instance_profile}"
}
```

## Provision Resources
* Provide AWS credentials of the account being used. More details here: https://www.terraform.io/docs/providers/aws/index.html
```
Expand Down Expand Up @@ -180,7 +225,9 @@ If all else fails, teardown and start over.

Terraform 0.11 or higher is supported.

NOTE: Release v0.2.0 onwards only Terraform 0.12 or higher is supported.
In order to support more versions of Terraform, the AWS Provider needs to be held at v2,
as v3 has breaking changes we don't currently support. Our example `main.tf` has the
code to accomplish this.

# Development

Expand All @@ -194,6 +241,24 @@ To develop in this repository, you'll want the following tools set up:

To run unit tests, you also need an AWS account to be able to provision resources.

# Releasing

Releases are made from the master branch. First, make sure you have the latest code from master pulled locally:

```
* git remote update
* git checkout master
* git reset origin/master --hard
```

Then, use [`git release`](https://github.com/tj/git-extras/blob/master/Commands.md#git-release) to cut a new version that follows [semver](https://semver.org):

```
git release x.y.z
```

Lastly, craft a new [GitHub release](https://github.com/segmentio/terraform-aws-data-lake/releases).

# License

Released under the [MIT License](https://opensource.org/licenses/MIT).
9 changes: 9 additions & 0 deletions development/_backend.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Remote state configuration for the development environment (S3 backend).
terraform {
  backend "s3" {
    # Where the state object lives.
    region = "us-east-2"
    bucket = "211459479356-terraform-state"
    key    = "data-engineering/segment-data-lake.tfstate"

    # How the state is accessed and stored.
    role_arn = "arn:aws:iam::211459479356:role/terraform"
    encrypt  = true
  }
}
9 changes: 9 additions & 0 deletions development/_provider.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# AWS provider for the development environment, pinned to provider version 2.50.
provider "aws" {
  version = "2.50"

  profile = "development"
  region  = "us-east-1"
}

# Restrict this configuration to the Terraform 0.12.x release line.
terraform {
  required_version = "~> 0.12.0"
}
69 changes: 69 additions & 0 deletions development/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Environment-wide settings for the development data lake.
locals {
  s3_bucket_name = "211459479356-slice-segment-data-lake"

  # Segment sources that will be enabled for Data Lakes.
  external_ids = [
    "KnSuvLUHMG",             # Direct-Web-QA
    "42ukzMtDak",             # Storefront-QA
    "e398yK2CXsbPJFbTRU9Nf8", # Braze-Dev
  ]

  # Subnet the EMR cluster will run in.
  subnet_id = "subnet-097e2dc4f7499f77a"

  # Prefix used to build full IAM ARNs from role / instance-profile names.
  arn_prefix = "arn:aws:iam::211459479356"

  # Tags passed to the modules below via their `tags` argument.
  default_tags = {
    department  = "data"
    subteam     = "dataeng"
    git         = "https://github.com/slicelife/terraform-aws-data-lake/"
    environment = "development"
    terraformed = "yes"
  }
}

# Groups the bucket name, Segment external IDs and subnet ID into one object.
# Values are referenced directly; Terraform 0.12 deprecates wrapping a lone
# reference in an interpolation string ("${...}").
# NOTE(review): local.tags does not appear to be referenced by the blocks
# visible in this file — confirm whether it is still needed.
locals {
  tags = {
    s3_bucket_name = local.s3_bucket_name
    external_ids   = local.external_ids
    subnet_id      = local.subnet_id
  }
}

# Reads the current version of the "dataeng/segment" secret from AWS Secrets
# Manager; its JSON payload supplies the `token` and `url` used by the
# segment module.
data "aws_secretsmanager_secret_version" "segment_secrets" {
  secret_id = "dataeng/segment"
}

# S3 bucket for the data lake, built from the local s3_bucket module.
module "s3_bucket" {
  source = "../modules/s3_bucket"

  s3_bucket = local.s3_bucket_name

  # NOTE(review): presumably the AWS account ID of the data account that is
  # granted access to the bucket — confirm against modules/s3_bucket.
  data_account = 409386690817

  tags = local.default_tags
}

# Glue resources from the upstream Segment module, named "segment_data_lake".
# NOTE(review): this module is pinned to v0.2.0 while iam and emr use v0.3.0 —
# confirm the mismatch is intentional.
module "glue" {
  name = "segment_data_lake"

  source = "https://github.com/segmentio/terraform-aws-data-lake/archive/v0.2.0.zip//terraform-segment-data-lakes-0.2.0/modules/glue"
}

# IAM resources from the upstream Segment module (v0.3.0).
# Arguments reference locals directly; Terraform 0.12 deprecates wrapping a
# lone reference in an interpolation string ("${...}").
module "iam" {
  source = "https://github.com/segmentio/terraform-aws-data-lake/archive/v0.3.0.zip//terraform-segment-data-lakes-0.3.0/modules/iam"

  s3_bucket    = local.s3_bucket_name
  external_ids = local.external_ids
  tags         = local.default_tags
}

# EMR cluster from the upstream Segment module (v0.3.0).
module "emr" {
  source = "https://github.com/segmentio/terraform-aws-data-lake/archive/v0.3.0.zip//terraform-segment-data-lakes-0.3.0/modules/emr"

  # Plain references: Terraform 0.12 deprecates interpolation-only strings.
  s3_bucket = local.s3_bucket_name
  subnet_id = local.subnet_id

  # Full ARNs are assembled here from the shared account prefix plus the
  # values the iam module outputs, so these remain genuine string templates.
  iam_emr_autoscaling_role = "${local.arn_prefix}:role/${module.iam.iam_emr_autoscaling_role}"
  iam_emr_service_role     = "${local.arn_prefix}:role/${module.iam.iam_emr_service_role}"
  iam_emr_instance_profile = "${local.arn_prefix}:instance-profile/${module.iam.iam_emr_instance_profile}"

  tags = local.default_tags
}

# Local segment module: receives the EMR cluster ID together with the
# credentials decoded from the "dataeng/segment" secret.
module "segment" {
  source = "../modules/segment"

  environment = "dev"
  cluster_id  = module.emr.cluster_id

  # Both values come from the same JSON secret string.
  url   = jsondecode(data.aws_secretsmanager_secret_version.segment_secrets.secret_string)["url"]
  token = jsondecode(data.aws_secretsmanager_secret_version.segment_secrets.secret_string)["token"]
}
Loading