
initial tf apply fails, but re-plan and re-apply fix it #384

@rocoll

Description


Terraform Version Details

I run terraform from inside the viya4-iac-aws container.

Command definition:

$ alias terraform

alias terraform='docker container run --rm --group-add root --user 1000:977 -v /home/cloud-user/.aws:/.aws -v /home/cloud-user/.ssh:/.ssh -v /home/cloud-user/viya4-iac-aws:/workspace --entrypoint terraform viya4-iac-aws'

TF version:

$ terraform version

Terraform v1.10.5
on linux_amd64
+ provider registry.terraform.io/hashicorp/aws v5.100.0
+ provider registry.terraform.io/hashicorp/cloudinit v2.3.7
+ provider registry.terraform.io/hashicorp/external v2.3.5
+ provider registry.terraform.io/hashicorp/kubernetes v2.38.0
+ provider registry.terraform.io/hashicorp/local v2.5.3
+ provider registry.terraform.io/hashicorp/null v3.2.4
+ provider registry.terraform.io/hashicorp/random v3.7.2
+ provider registry.terraform.io/hashicorp/time v0.13.1
+ provider registry.terraform.io/hashicorp/tls v4.1.0

Terraform Variable File Details

Nothing special here.

# REQUIRED VARIABLES
# Necessary for use by the IAC
# --------------------------------------
# - Prefix is used for naming resources for easy identification
# - Location is the geo region where resources will be placed
#
prefix                                  = "crest-p41756"
location                                = "us-east-1" 

# ACCESS, IDENTITY, and AUTHENTICATION
# Who is doing what where
# --------------------------------------
#
aws_profile                             = "default"            # or whatever you named it
ssh_public_key                          = "~/.ssh/id_rsa.pub"
create_static_kubeconfig                = true

# CIDR
# Specify public access CIDR to allow ingress traffic to the EKS cluster
# --------------------------------------
# - Define access from RACE VMWARE and RACE Azure clients networks
#
default_public_access_cidrs         = ["149.173.0.0/16", "52.226.102.80/32", "52.226.102.81/32"]

# TAGS
# Optional metadata associated with AWS resources
# --------------------------------------
# - Resourceowner makes it easy to find associated resources
# - Project_Name and GEL_Project are for tracking
# - Chronos (old) and Smart Parking (new) are SAS IT programs to auto-shutdown resources
# - GEL Smart Parking Exemption: RITM0988495
#
tags = {
  "resourceowner"          = "crest-p41756",
  "project_name"           = "PSGEL297",
  "gel_project"            = "PSGEL297",
  "disable_chronos"        = "True",
  "smart_parking_disabled" = "True"
}

# EXTERNAL POSTGRES SERVER
# --------------------------------------
# - if defined, creates an External Postgres Server in AWS, else use internal Crunchy
#
#postgres_servers = {
#  default = {},
#}

## Cluster config
kubernetes_version                      = "1.31"

default_nodepool_node_count             = 2
default_nodepool_vm_type                = "m7i-flex.2xlarge"
default_nodepool_custom_data            = ""

## General
efs_performance_mode                    = "generalPurpose"
storage_type                            = "standard"

## Cluster Node Pools config
node_pools = {
  cas = {
    "vm_type"      = "m7i-flex.2xlarge"
    "cpu_type"     = "AL2_x86_64"
    "os_disk_type" = "gp2"
    "os_disk_size" = 200
    "os_disk_iops" = 0
    "min_nodes"    = 4
    "max_nodes"    = 5
    "node_taints"  = ["workload.sas.com/class=cas:NoSchedule"]
    "node_labels" = {
      "workload.sas.com/class" = "cas"
    }
    "custom_data"                          = ""
    "metadata_http_endpoint"               = "enabled"
    "metadata_http_tokens"                 = "required"
    "metadata_http_put_response_hop_limit" = 1
  },
  compute = {
    "vm_type"      = "m7i-flex.8xlarge"
    "cpu_type"     = "AL2_x86_64"
    "os_disk_type" = "gp2"
    "os_disk_size" = 200
    "os_disk_iops" = 0
    "min_nodes"    = 1
    "max_nodes"    = 5
    "node_taints"  = ["workload.sas.com/class=compute:NoSchedule"]
    "node_labels" = {
      "workload.sas.com/class"        = "compute"
      "launcher.sas.com/prepullImage" = "sas-programming-environment"
    }
    "custom_data"                          = ""
    "metadata_http_endpoint"               = "enabled"
    "metadata_http_tokens"                 = "required"
    "metadata_http_put_response_hop_limit" = 1
  },
  connect = {
    "vm_type"      = "m7i-flex.8xlarge"
    "cpu_type"     = "AL2_x86_64"
    "os_disk_type" = "gp2"
    "os_disk_size" = 200
    "os_disk_iops" = 0
    "min_nodes"    = 1
    "max_nodes"    = 5
    "node_taints"  = ["workload.sas.com/class=connect:NoSchedule"]
    "node_labels" = {
      "workload.sas.com/class"        = "connect"
      "launcher.sas.com/prepullImage" = "sas-programming-environment"
    }
    "custom_data"                          = ""
    "metadata_http_endpoint"               = "enabled"
    "metadata_http_tokens"                 = "required"
    "metadata_http_put_response_hop_limit" = 1
  },
  stateless = {
    "vm_type"      = "m7i-flex.4xlarge"
    "cpu_type"     = "AL2_x86_64"
    "os_disk_type" = "gp2"
    "os_disk_size" = 200
    "os_disk_iops" = 0
    "min_nodes"    = 1
    "max_nodes"    = 5
    "node_taints"  = ["workload.sas.com/class=stateless:NoSchedule"]
    "node_labels" = {
      "workload.sas.com/class" = "stateless"
    }
    "custom_data"                          = ""
    "metadata_http_endpoint"               = "enabled"
    "metadata_http_tokens"                 = "required"
    "metadata_http_put_response_hop_limit" = 1
  },
  stateful = {
    "vm_type"      = "m7i-flex.4xlarge"
    "cpu_type"     = "AL2_x86_64"
    "os_disk_type" = "gp2"
    "os_disk_size" = 200
    "os_disk_iops" = 0
    "min_nodes"    = 1
    "max_nodes"    = 3
    "node_taints"  = ["workload.sas.com/class=stateful:NoSchedule"]
    "node_labels" = {
      "workload.sas.com/class" = "stateful"
    }
    "custom_data"                          = ""
    "metadata_http_endpoint"               = "enabled"
    "metadata_http_tokens"                 = "required"
    "metadata_http_put_response_hop_limit" = 1
  }
}

# Jump Server
create_jump_vm                        = true
jump_vm_admin                         = "jumpuser"
jump_vm_type                          = "t3.small"

# NFS Server
# required ONLY when storage_type is "standard" to create NFS Server VM
create_nfs_public_ip                  = false
nfs_vm_admin                          = "nfsuser"
nfs_vm_type                           = "m7i-flex.xlarge"

Steps to Reproduce

Run "terraform apply" against a fresh environment. The failure occurs on pretty much every first run.

Expected Behavior

I expect Terraform to provision all of the AWS resources successfully in a single apply.

Actual Behavior

The "terraform apply" fails with error:

╷
│ Error: Unauthorized
│ 
│   with module.kubeconfig.kubernetes_service_account.kubernetes_sa[0],
│   on modules/kubeconfig/main.tf line 66, in resource "kubernetes_service_account" "kubernetes_sa":
│   66: resource "kubernetes_service_account" "kubernetes_sa" {
│ 
╵

Additional Context

I've looked into it a little, and it seems that Terraform runs multiple tasks concurrently. When it gets to this point, one step depends on a resource (the k8s service account) that doesn't exist yet, so Terraform errors out.
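A quick way to see this (an illustrative command only; the module addresses here are assumptions, so check what "terraform state list" actually prints for your run) is to inspect the state left behind by the failed first apply. The cluster resources are already recorded, while the kubeconfig service account never made it in:

# List cluster and kubeconfig entries in the state from the failed apply
terraform state list -state=/workspace/${MY_PREFIX}.tfstate | grep -E 'module\.(eks|kubeconfig)'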

The workaround/fix for this is to simply re-plan (specifying the current tfstate file) and then re-apply:

# Note that I'm running TF from inside the viya4-iac-aws container
terraform plan \
     -input=false \
     -var-file=/workspace/${MY_PREFIX}.tfvars \
     -state /workspace/${MY_PREFIX}.tfstate \
     -out /workspace/${MY_PREFIX}.tfplan

The plan then reports:

Plan: 6 to add, 2 to change, 0 to destroy.

And then "terraform apply" runs successfully after that and I have all expected AWS resources provisioned by the IAC.

This is possibly related to Issue 345: the internal discussions there also note circumstances where required resources don't yet exist when the dependent objects need them. That might justify restructuring the "terraform apply" process into multiple runs instead of one, so that required resources are fully provisioned before the dependent objects need them.
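As a rough sketch of what that could look like (the module address "module.eks" is an assumption, not verified against the IAC source), a two-stage apply could target the EKS cluster first and only then apply the dependent resources:

# Stage 1: provision the EKS cluster by itself (Terraform warns that
# -target is for exceptional situations, but it works)
terraform apply \
     -input=false \
     -var-file=/workspace/${MY_PREFIX}.tfvars \
     -state /workspace/${MY_PREFIX}.tfstate \
     -target=module.eks

# Stage 2: apply everything else once the cluster API is reachable
terraform apply \
     -input=false \
     -var-file=/workspace/${MY_PREFIX}.tfvars \
     -state /workspace/${MY_PREFIX}.tfstate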

References

Issue 345

