
Commit 66c9545

Delete unused prometheus volume on cluster down (#1863)

Author: Miguel Varela Ramos
Parent: 28ba9a9

File tree: 5 files changed (+99, -30 lines)

cli/cmd/cluster.go

Lines changed: 9 additions & 1 deletion

```diff
@@ -58,6 +58,7 @@ var (
 	_flagClusterRegion          string
 	_flagClusterInfoDebug       bool
 	_flagClusterDisallowPrompt  bool
+	_flagClusterDownKeepVolumes bool
 	_flagAWSAccessKeyID         string
 	_flagAWSSecretAccessKey     string
 	_flagClusterAWSAccessKeyID  string
@@ -97,6 +98,7 @@ func clusterInit() {
 	addClusterRegionFlag(_clusterDownCmd)
 	addAWSCredentialsFlags(_clusterDownCmd)
 	_clusterDownCmd.Flags().BoolVarP(&_flagClusterDisallowPrompt, "yes", "y", false, "skip prompts")
+	_clusterDownCmd.Flags().BoolVar(&_flagClusterDownKeepVolumes, "keep-volumes", false, "keep cortex provisioned persistent volumes")
 	_clusterCmd.AddCommand(_clusterDownCmd)

 	_clusterExportCmd.Flags().SortFlags = false
@@ -487,7 +489,13 @@ var _clusterDownCmd = &cobra.Command{
 	}

 	fmt.Print("○ spinning down the cluster ...")
-	out, exitCode, err := runManagerAccessCommand("/root/uninstall.sh", *accessConfig, awsClient, nil, nil)
+
+	uninstallCmd := "/root/uninstall.sh"
+	if _flagClusterDownKeepVolumes {
+		uninstallCmd += " --keep-volumes"
+	}
+
+	out, exitCode, err := runManagerAccessCommand(uninstallCmd, *accessConfig, awsClient, nil, nil)
 	if err != nil {
 		errors.PrintError(err)
 		fmt.Println()
```
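In effect, the new `--keep-volumes` flag on the AWS `cortex cluster down` command is forwarded to the manager's uninstall script. A usage sketch, with the flag and command names taken directly from this diff:

```bash
# spin down the cluster and delete the Prometheus/Grafana volumes (the new default)
cortex cluster down

# spin down the cluster but keep the monitoring volumes
cortex cluster down --keep-volumes
```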

cli/cmd/cluster_gcp.go

Lines changed: 34 additions & 8 deletions

```diff
@@ -40,14 +40,15 @@ import (
 )

 var (
-	_flagClusterGCPUpEnv          string
-	_flagClusterGCPInfoEnv        string
-	_flagClusterGCPInfoDebug      bool
-	_flagClusterGCPConfig         string
-	_flagClusterGCPName           string
-	_flagClusterGCPZone           string
-	_flagClusterGCPProject        string
-	_flagClusterGCPDisallowPrompt bool
+	_flagClusterGCPUpEnv           string
+	_flagClusterGCPInfoEnv         string
+	_flagClusterGCPInfoDebug       bool
+	_flagClusterGCPConfig          string
+	_flagClusterGCPName            string
+	_flagClusterGCPZone            string
+	_flagClusterGCPProject         string
+	_flagClusterGCPDisallowPrompt  bool
+	_flagClusterGCPDownKeepVolumes bool
 )

 func clusterGCPInit() {
@@ -73,6 +74,7 @@ func clusterGCPInit() {
 	addClusterGCPProjectFlag(_clusterGCPDownCmd)
 	addClusterGCPZoneFlag(_clusterGCPDownCmd)
 	addClusterGCPDisallowPromptFlag(_clusterGCPDownCmd)
+	_clusterGCPDownCmd.Flags().BoolVar(&_flagClusterGCPDownKeepVolumes, "keep-volumes", false, "keep cortex provisioned persistent volumes")
 	_clusterGCPCmd.AddCommand(_clusterGCPDownCmd)
 }

@@ -263,6 +265,30 @@ var _clusterGCPDownCmd = &cobra.Command{

 	fmt.Print("○ spinning down the cluster ")

+	uninstallCmd := "/root/uninstall.sh"
+	if _flagClusterGCPDownKeepVolumes {
+		uninstallCmd += " --keep-volumes"
+	}
+	output, exitCode, err := runGCPManagerAccessCommand(uninstallCmd, *accessConfig, nil, nil)
+	if (exitCode != nil && *exitCode != 0) || err != nil {
+		if len(output) == 0 {
+			fmt.Printf("\n")
+		}
+		fmt.Print("\n")
+
+		gkePvcDiskPrefix := fmt.Sprintf("gke-%s", *accessConfig.ClusterName)
+		if err != nil {
+			fmt.Print(fmt.Sprintf("○ failed to delete persistent disks from storage, please visit https://console.cloud.google.com/compute/disks?project=%s to manually delete the disks starting with the %s prefix: %s", *accessConfig.Project, gkePvcDiskPrefix, err.Error()))
+			telemetry.Error(ErrorClusterDown(err.Error()))
+		} else {
+			fmt.Print(fmt.Sprintf("○ failed to delete persistent disks from storage, please visit https://console.cloud.google.com/compute/disks?project=%s to manually delete the disks starting with the %s prefix", *accessConfig.Project, gkePvcDiskPrefix))
+			telemetry.Error(ErrorClusterDown(output))
+		}
+
+		fmt.Print("\n\n")
+		fmt.Print("○ proceeding with best-effort deletion of the cluster ")
+	}
+
 	_, err = gcpClient.DeleteCluster(gkeClusterName)
 	if err != nil {
 		fmt.Print("\n\n")
```
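The GCP path behaves the same way, and also degrades gracefully: if the uninstall script fails, the CLI prints the console URL for manual disk cleanup and proceeds with a best-effort deletion of the cluster. Usage mirrors the AWS command:

```bash
# delete the cluster and its monitoring volumes
cortex cluster-gcp down

# delete the cluster but keep the gke-<cluster name>-prefixed disks
cortex cluster-gcp down --keep-volumes
```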

docs/clusters/aws/uninstall.md

Lines changed: 6 additions & 11 deletions

````diff
@@ -26,22 +26,17 @@
 aws s3 rb --force s3://<bucket>
 aws logs describe-log-groups --log-group-name-prefix=<cluster_name> --query logGroups[*].[logGroupName] --output text | xargs -I {} aws logs delete-log-group --log-group-name {}
 ```

-## Delete Volumes
-
-The volumes used by Cortex's Prometheus and Grafana instances are not deleted by default, as they might contain important
-information. If these volumes are not required anymore, you can delete them in the AWS console.
-
-To delete the volumes, navigate to the [EC2 volumes page](https://console.aws.amazon.com/ec2/v2/home?#Volumes)
-in the AWS console (be sure to set the appropriate region), select the volumes, click "Actions" and then "Delete Volume".
-Both volumes for Prometheus and Grafana that Cortex created have a name that starts with `kubernetes-dynamic-pvc`,
-the `kubernetes.io/cluster/<cluster name>` tag is set to `owned`, and the `kubernetes.io/created-for/pvc/name` tag starts
-with `prometheus-` and `grafana-` respectively.
-
 ## Delete Certificates

 If you've configured a custom domain for your APIs, you can remove the SSL Certificate and Hosted Zone for the domain by
 following these [instructions](networking/custom-domain.md#cleanup).

+## Keep Cortex Volumes
+
+The volumes used by Cortex's Prometheus and Grafana instances are deleted by default on a cluster down operation.
+If you want to keep the metrics and dashboards volumes for any reason,
+you can pass the `--keep-volumes` flag to the `cortex cluster down` command.
+
 ## Troubleshooting

 On rare occasions, `cortex cluster down` may not be able to spin down your Cortex cluster. When this happens, follow
````
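For reference, if a cluster was brought down with `--keep-volumes` and the volumes are later no longer needed, an AWS CLI sketch along these lines can locate and remove them; the tag filter comes from the documentation removed above, and `<cluster name>`/`<volume id>` are placeholders to fill in after verifying the matches:

```bash
# list EBS volumes created for the cluster's Prometheus/Grafana PVCs
aws ec2 describe-volumes \
  --filters "Name=tag:kubernetes.io/cluster/<cluster name>,Values=owned" \
  --query "Volumes[].VolumeId" --output text

# delete a volume once you've confirmed it is the right one and is unattached
aws ec2 delete-volume --volume-id <volume id>
```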

docs/clusters/gcp/uninstall.md

Lines changed: 4 additions & 7 deletions

```diff
@@ -10,11 +10,8 @@ cortex cluster-gcp down
 The `cortex cluster-gcp down` command doesn't wait for the cluster to spin down. You can ensure that the cluster has
 spun down by checking the GKE console.

-## Delete Volumes
+## Keep Cortex Volumes

-The volumes used by Cortex's Prometheus and Grafana instances are not deleted by default, as they might contain important
-information. If these volumes are not required anymore, you can delete them in the GCP console. Navigate to
-the [Disks](https://console.cloud.google.com/compute/disks) page (be sure to set the appropriate project), select the
-volumes, and click "Delete". The Prometheus and Grafana volumes that Cortex created have a name that starts
-with `gke-<cluster name>-`, and the `kubernetes.io/created-for/pvc/name` tag starts with `prometheus-` and `grafana-`
-respectively.
+The volumes used by Cortex's Prometheus and Grafana instances are deleted by default on a cluster down operation.
+If you want to keep the metrics and dashboards volumes for any reason,
+you can pass the `--keep-volumes` flag to the `cortex cluster-gcp down` command.
```
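Similarly on GCP, if disks were kept and need cleaning up later, a sketch using `gcloud` can find them; the `gke-<cluster name>` prefix comes from the removed docs and the code change above, and `<project>`/`<zone>`/`<disk name>` are placeholders to verify before deleting:

```bash
# list the persistent disks left behind by Cortex's monitoring stack
gcloud compute disks list --project <project> --filter="name~^gke-<cluster name>"

# delete a disk after confirming it is unattached
gcloud compute disks delete <disk name> --zone <zone> --project <project>
```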

manager/uninstall.sh

Lines changed: 46 additions & 3 deletions

```diff
@@ -18,8 +18,51 @@ set -e

 EKSCTL_TIMEOUT=45m

-echo
+arg1="$1"

-eksctl delete cluster --wait --name=$CORTEX_CLUSTER_NAME --region=$CORTEX_REGION --timeout=$EKSCTL_TIMEOUT
+function main() {
+  if [ "$CORTEX_PROVIDER" == "aws" ]; then
+    uninstall_aws
+  elif [ "$CORTEX_PROVIDER" == "gcp" ]; then
+    uninstall_gcp
+  fi
+}

-echo -e "\n✓ done spinning down the cluster"
+function uninstall_gcp() {
+  gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS 2> /dev/stdout 1> /dev/null | (grep -v "Activated service account credentials" || true)
+  gcloud container clusters get-credentials $CORTEX_CLUSTER_NAME --project $CORTEX_GCP_PROJECT --region $CORTEX_GCP_ZONE 2> /dev/stdout 1> /dev/null | (grep -v "Fetching cluster" | grep -v "kubeconfig entry generated" || true)
+
+  if [ "$arg1" != "--keep-volumes" ]; then
+    uninstall_prometheus
+    uninstall_grafana
+  fi
+}
+
+function uninstall_aws() {
+  echo
+
+  aws eks --region $CORTEX_REGION update-kubeconfig --name $CORTEX_CLUSTER_NAME >/dev/null
+
+  if [ "$arg1" != "--keep-volumes" ]; then
+    uninstall_prometheus
+    uninstall_grafana
+  fi
+
+  eksctl delete cluster --wait --name=$CORTEX_CLUSTER_NAME --region=$CORTEX_REGION --timeout=$EKSCTL_TIMEOUT
+  echo -e "\n✓ done spinning down the cluster"
+}
+
+function uninstall_prometheus() {
+  kubectl get configmap cluster-config -o jsonpath='{.data.cluster\.yaml}' > ./cluster.yaml
+
+  # delete resources to detach disk
+  python render_template.py ./cluster.yaml manifests/prometheus-monitoring.yaml.j2 | kubectl delete -f - >/dev/null
+  kubectl delete pvc --namespace default prometheus-prometheus-db-prometheus-prometheus-0 >/dev/null
+}
+
+function uninstall_grafana() {
+  kubectl delete statefulset --namespace default grafana >/dev/null
+  kubectl delete pvc --namespace default grafana-storage >/dev/null
+}
+
+main
```
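The CLI (see cluster.go and cluster_gcp.go above) runs this script inside the manager container, passing the optional flag as `$1`:

```bash
# as invoked by the CLI inside the manager container
/root/uninstall.sh                 # default: also deletes the monitoring volumes
/root/uninstall.sh --keep-volumes  # preserves the Prometheus and Grafana volumes
```

Note the ordering in `uninstall_prometheus` and `uninstall_grafana`: the monitoring workloads (the rendered prometheus-monitoring manifests and the grafana StatefulSet) are deleted first so the disks detach, and only then are the PVCs deleted, which releases the backing volumes cleanly.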
