
Commit 66c9545

Delete unused prometheus volume on cluster down (#1863)

Author: Miguel Varela Ramos
Parent: 28ba9a9

File tree: 5 files changed (+99, -30 lines)

cli/cmd/cluster.go

Lines changed: 9 additions & 1 deletion

```diff
@@ -58,6 +58,7 @@ var (
 	_flagClusterRegion          string
 	_flagClusterInfoDebug       bool
 	_flagClusterDisallowPrompt  bool
+	_flagClusterDownKeepVolumes bool
 	_flagAWSAccessKeyID         string
 	_flagAWSSecretAccessKey     string
 	_flagClusterAWSAccessKeyID  string
@@ -97,6 +98,7 @@ func clusterInit() {
 	addClusterRegionFlag(_clusterDownCmd)
 	addAWSCredentialsFlags(_clusterDownCmd)
 	_clusterDownCmd.Flags().BoolVarP(&_flagClusterDisallowPrompt, "yes", "y", false, "skip prompts")
+	_clusterDownCmd.Flags().BoolVar(&_flagClusterDownKeepVolumes, "keep-volumes", false, "keep cortex provisioned persistent volumes")
 	_clusterCmd.AddCommand(_clusterDownCmd)

 	_clusterExportCmd.Flags().SortFlags = false
@@ -487,7 +489,13 @@ var _clusterDownCmd = &cobra.Command{
 	}

 	fmt.Print("○ spinning down the cluster ...")
-	out, exitCode, err := runManagerAccessCommand("/root/uninstall.sh", *accessConfig, awsClient, nil, nil)
+
+	uninstallCmd := "/root/uninstall.sh"
+	if _flagClusterDownKeepVolumes {
+		uninstallCmd += " --keep-volumes"
+	}
+
+	out, exitCode, err := runManagerAccessCommand(uninstallCmd, *accessConfig, awsClient, nil, nil)
 	if err != nil {
 		errors.PrintError(err)
 		fmt.Println()
```
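In effect, the new `--keep-volumes` flag on the AWS `cortex cluster down` command is forwarded to the manager's uninstall script. A usage sketch, with the flag and command names taken directly from this diff:

```bash
# spin down the cluster and delete the Prometheus/Grafana volumes (the new default)
cortex cluster down

# spin down the cluster but keep the monitoring volumes
cortex cluster down --keep-volumes
```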

cli/cmd/cluster_gcp.go

Lines changed: 34 additions & 8 deletions

```diff
@@ -40,14 +40,15 @@ import (
 )

 var (
-	_flagClusterGCPUpEnv          string
-	_flagClusterGCPInfoEnv        string
-	_flagClusterGCPInfoDebug      bool
-	_flagClusterGCPConfig         string
-	_flagClusterGCPName           string
-	_flagClusterGCPZone           string
-	_flagClusterGCPProject        string
-	_flagClusterGCPDisallowPrompt bool
+	_flagClusterGCPUpEnv           string
+	_flagClusterGCPInfoEnv         string
+	_flagClusterGCPInfoDebug       bool
+	_flagClusterGCPConfig          string
+	_flagClusterGCPName            string
+	_flagClusterGCPZone            string
+	_flagClusterGCPProject         string
+	_flagClusterGCPDisallowPrompt  bool
+	_flagClusterGCPDownKeepVolumes bool
 )

 func clusterGCPInit() {
@@ -73,6 +74,7 @@ func clusterGCPInit() {
 	addClusterGCPProjectFlag(_clusterGCPDownCmd)
 	addClusterGCPZoneFlag(_clusterGCPDownCmd)
 	addClusterGCPDisallowPromptFlag(_clusterGCPDownCmd)
+	_clusterGCPDownCmd.Flags().BoolVar(&_flagClusterGCPDownKeepVolumes, "keep-volumes", false, "keep cortex provisioned persistent volumes")
 	_clusterGCPCmd.AddCommand(_clusterGCPDownCmd)
 }

@@ -263,6 +265,30 @@ var _clusterGCPDownCmd = &cobra.Command{

 	fmt.Print("○ spinning down the cluster ")

+	uninstallCmd := "/root/uninstall.sh"
+	if _flagClusterGCPDownKeepVolumes {
+		uninstallCmd += " --keep-volumes"
+	}
+	output, exitCode, err := runGCPManagerAccessCommand(uninstallCmd, *accessConfig, nil, nil)
+	if (exitCode != nil && *exitCode != 0) || err != nil {
+		if len(output) == 0 {
+			fmt.Printf("\n")
+		}
+		fmt.Print("\n")
+
+		gkePvcDiskPrefix := fmt.Sprintf("gke-%s", *accessConfig.ClusterName)
+		if err != nil {
+			fmt.Print(fmt.Sprintf("○ failed to delete persistent disks from storage, please visit https://console.cloud.google.com/compute/disks?project=%s to manually delete the disks starting with the %s prefix: %s", *accessConfig.Project, gkePvcDiskPrefix, err.Error()))
+			telemetry.Error(ErrorClusterDown(err.Error()))
+		} else {
+			fmt.Print(fmt.Sprintf("○ failed to delete persistent disks from storage, please visit https://console.cloud.google.com/compute/disks?project=%s to manually delete the disks starting with the %s prefix", *accessConfig.Project, gkePvcDiskPrefix))
+			telemetry.Error(ErrorClusterDown(output))
+		}
+
+		fmt.Print("\n\n")
+		fmt.Print("○ proceeding with best-effort deletion of the cluster ")
+	}
+
 	_, err = gcpClient.DeleteCluster(gkeClusterName)
 	if err != nil {
 		fmt.Print("\n\n")
```
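The GCP path behaves the same way, and also degrades gracefully: if the uninstall script fails, the CLI prints the console URL for manual disk cleanup and proceeds with a best-effort deletion of the cluster. Usage mirrors the AWS command:

```bash
# delete the cluster and its monitoring volumes
cortex cluster-gcp down

# delete the cluster but keep the gke-<cluster name>-prefixed disks
cortex cluster-gcp down --keep-volumes
```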

docs/clusters/aws/uninstall.md

Lines changed: 6 additions & 11 deletions

````diff
@@ -26,22 +26,17 @@
 aws s3 rb --force s3://<bucket>
 aws logs describe-log-groups --log-group-name-prefix=<cluster_name> --query logGroups[*].[logGroupName] --output text | xargs -I {} aws logs delete-log-group --log-group-name {}
 ```

-## Delete Volumes
-
-The volumes used by Cortex's Prometheus and Grafana instances are not deleted by default, as they might contain important
-information. If these volumes are not required anymore, you can delete them in the AWS console.
-
-To delete the volumes, navigate to the [EC2 volumes page](https://console.aws.amazon.com/ec2/v2/home?#Volumes)
-in the AWS console (be sure to set the appropriate region), select the volumes, click "Actions" and then "Delete Volume".
-Both volumes for Prometheus and Grafana that Cortex created have a name that starts with `kubernetes-dynamic-pvc`,
-the `kubernetes.io/cluster/<cluster name>` tag is set to `owned`, and the `kubernetes.io/created-for/pvc/name` tag starts
-with `prometheus-` and `grafana-` respectively.
-
 ## Delete Certificates

 If you've configured a custom domain for your APIs, you can remove the SSL Certificate and Hosted Zone for the domain by
 following these [instructions](networking/custom-domain.md#cleanup).

+## Keep Cortex Volumes
+
+The volumes used by Cortex's Prometheus and Grafana instances are deleted by default on a cluster down operation.
+If you want to keep the metrics and dashboards volumes for any reason,
+you can pass the `--keep-volumes` flag to the `cortex cluster down` command.
+
 ## Troubleshooting

 On rare occasions, `cortex cluster down` may not be able to spin down your Cortex cluster. When this happens, follow
````
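For reference, if a cluster was brought down with `--keep-volumes` and the volumes are later no longer needed, an AWS CLI sketch along these lines can locate and remove them; the tag filter comes from the documentation removed above, and `<cluster name>`/`<volume id>` are placeholders to fill in after verifying the matches:

```bash
# list EBS volumes created for the cluster's Prometheus/Grafana PVCs
aws ec2 describe-volumes \
  --filters "Name=tag:kubernetes.io/cluster/<cluster name>,Values=owned" \
  --query "Volumes[].VolumeId" --output text

# delete a volume once you've confirmed it is the right one and is unattached
aws ec2 delete-volume --volume-id <volume id>
```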

docs/clusters/gcp/uninstall.md

Lines changed: 4 additions & 7 deletions

```diff
@@ -10,11 +10,8 @@ cortex cluster-gcp down
 The `cortex cluster-gcp down` command doesn't wait for the cluster to spin down. You can ensure that the cluster has
 spun down by checking the GKE console.

-## Delete Volumes
+## Keep Cortex Volumes

-The volumes used by Cortex's Prometheus and Grafana instances are not deleted by default, as they might contain important
-information. If these volumes are not required anymore, you can delete them in the GCP console. Navigate to
-the [Disks](https://console.cloud.google.com/compute/disks) page (be sure to set the appropriate project), select the
-volumes, and click "Delete". The Prometheus and Grafana volumes that Cortex created have a name that starts
-with `gke-<cluster name>-`, and the `kubernetes.io/created-for/pvc/name` tag starts with `prometheus-` and `grafana-`
-respectively.
+The volumes used by Cortex's Prometheus and Grafana instances are deleted by default on a cluster down operation.
+If you want to keep the metrics and dashboards volumes for any reason,
+you can pass the `--keep-volumes` flag to the `cortex cluster-gcp down` command.
```
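Similarly on GCP, if disks were kept and need cleaning up later, a sketch using `gcloud` can find them; the `gke-<cluster name>` prefix comes from the removed docs and the code change above, and `<project>`/`<zone>`/`<disk name>` are placeholders to verify before deleting:

```bash
# list the persistent disks left behind by Cortex's monitoring stack
gcloud compute disks list --project <project> --filter="name~^gke-<cluster name>"

# delete a disk after confirming it is unattached
gcloud compute disks delete <disk name> --zone <zone> --project <project>
```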

manager/uninstall.sh

Lines changed: 46 additions & 3 deletions

```diff
@@ -18,8 +18,51 @@ set -e

 EKSCTL_TIMEOUT=45m

-echo
+arg1="$1"

-eksctl delete cluster --wait --name=$CORTEX_CLUSTER_NAME --region=$CORTEX_REGION --timeout=$EKSCTL_TIMEOUT
+function main() {
+  if [ "$CORTEX_PROVIDER" == "aws" ]; then
+    uninstall_aws
+  elif [ "$CORTEX_PROVIDER" == "gcp" ]; then
+    uninstall_gcp
+  fi
+}

-echo -e "\n✓ done spinning down the cluster"
+function uninstall_gcp() {
+  gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS 2> /dev/stdout 1> /dev/null | (grep -v "Activated service account credentials" || true)
+  gcloud container clusters get-credentials $CORTEX_CLUSTER_NAME --project $CORTEX_GCP_PROJECT --region $CORTEX_GCP_ZONE 2> /dev/stdout 1> /dev/null | (grep -v "Fetching cluster" | grep -v "kubeconfig entry generated" || true)
+
+  if [ "$arg1" != "--keep-volumes" ]; then
+    uninstall_prometheus
+    uninstall_grafana
+  fi
+}
+
+function uninstall_aws() {
+  echo
+
+  aws eks --region $CORTEX_REGION update-kubeconfig --name $CORTEX_CLUSTER_NAME >/dev/null
+
+  if [ "$arg1" != "--keep-volumes" ]; then
+    uninstall_prometheus
+    uninstall_grafana
+  fi
+
+  eksctl delete cluster --wait --name=$CORTEX_CLUSTER_NAME --region=$CORTEX_REGION --timeout=$EKSCTL_TIMEOUT
+  echo -e "\n✓ done spinning down the cluster"
+}
+
+function uninstall_prometheus() {
+  kubectl get configmap cluster-config -o jsonpath='{.data.cluster\.yaml}' > ./cluster.yaml
+
+  # delete resources to detach disk
+  python render_template.py ./cluster.yaml manifests/prometheus-monitoring.yaml.j2 | kubectl delete -f - >/dev/null
+  kubectl delete pvc --namespace default prometheus-prometheus-db-prometheus-prometheus-0 >/dev/null
+}
+
+function uninstall_grafana() {
+  kubectl delete statefulset --namespace default grafana >/dev/null
+  kubectl delete pvc --namespace default grafana-storage >/dev/null
+}
+
+main
```
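The CLI (see cluster.go and cluster_gcp.go above) runs this script inside the manager container, passing the optional flag as `$1`:

```bash
# as invoked by the CLI inside the manager container
/root/uninstall.sh                 # default: also deletes the monitoring volumes
/root/uninstall.sh --keep-volumes  # preserves the Prometheus and Grafana volumes
```

Note the ordering in `uninstall_prometheus` and `uninstall_grafana`: the monitoring workloads (the rendered prometheus-monitoring manifests and the grafana StatefulSet) are deleted first so the disks detach, and only then are the PVCs deleted, which releases the backing volumes cleanly.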
