Delete unused prometheus volume on cluster down #1863

Merged
merged 12 commits into from Mar 1, 2021
10 changes: 9 additions & 1 deletion cli/cmd/cluster.go
@@ -58,6 +58,7 @@ var (
_flagClusterRegion string
_flagClusterInfoDebug bool
_flagClusterDisallowPrompt bool
_flagClusterDownKeepVolumes bool
_flagAWSAccessKeyID string
_flagAWSSecretAccessKey string
_flagClusterAWSAccessKeyID string
@@ -97,6 +98,7 @@ func clusterInit() {
addClusterRegionFlag(_clusterDownCmd)
addAWSCredentialsFlags(_clusterDownCmd)
_clusterDownCmd.Flags().BoolVarP(&_flagClusterDisallowPrompt, "yes", "y", false, "skip prompts")
_clusterDownCmd.Flags().BoolVar(&_flagClusterDownKeepVolumes, "keep-volumes", false, "keep cortex provisioned persistent volumes")
Contributor
This flag might be renamed to --keep-cloud-resources so that it can be used to preserve volumes, buckets, policies, and log groups. For now I think it is fine.
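A hypothetical invocation under that rename might look like this; the `--keep-cloud-resources` flag is only the reviewer's suggestion and is not implemented in this PR, which adds `--keep-volumes` only:

```
# hypothetical flag name suggested above (not implemented in this PR)
cortex cluster down --keep-cloud-resources
```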

_clusterCmd.AddCommand(_clusterDownCmd)

_clusterExportCmd.Flags().SortFlags = false
@@ -487,7 +489,13 @@ var _clusterDownCmd = &cobra.Command{
}

fmt.Print("○ spinning down the cluster ...")
out, exitCode, err := runManagerAccessCommand("/root/uninstall.sh", *accessConfig, awsClient, nil, nil)

uninstallCmd := "/root/uninstall.sh"
if _flagClusterDownKeepVolumes {
uninstallCmd += " --keep-volumes"
}

out, exitCode, err := runManagerAccessCommand(uninstallCmd, *accessConfig, awsClient, nil, nil)
if err != nil {
errors.PrintError(err)
fmt.Println()
42 changes: 34 additions & 8 deletions cli/cmd/cluster_gcp.go
@@ -40,14 +40,15 @@ import (
)

var (
_flagClusterGCPUpEnv string
_flagClusterGCPInfoEnv string
_flagClusterGCPInfoDebug bool
_flagClusterGCPConfig string
_flagClusterGCPName string
_flagClusterGCPZone string
_flagClusterGCPProject string
_flagClusterGCPDisallowPrompt bool
_flagClusterGCPUpEnv string
_flagClusterGCPInfoEnv string
_flagClusterGCPInfoDebug bool
_flagClusterGCPConfig string
_flagClusterGCPName string
_flagClusterGCPZone string
_flagClusterGCPProject string
_flagClusterGCPDisallowPrompt bool
_flagClusterGCPDownKeepVolumes bool
)

func clusterGCPInit() {
@@ -73,6 +74,7 @@ func clusterGCPInit() {
addClusterGCPProjectFlag(_clusterGCPDownCmd)
addClusterGCPZoneFlag(_clusterGCPDownCmd)
addClusterGCPDisallowPromptFlag(_clusterGCPDownCmd)
_clusterGCPDownCmd.Flags().BoolVar(&_flagClusterGCPDownKeepVolumes, "keep-volumes", false, "keep cortex provisioned persistent volumes")
_clusterGCPCmd.AddCommand(_clusterGCPDownCmd)
}

@@ -263,6 +265,30 @@ var _clusterGCPDownCmd = &cobra.Command{

fmt.Print("○ spinning down the cluster ")

uninstallCmd := "/root/uninstall.sh"
if _flagClusterGCPDownKeepVolumes {
uninstallCmd += " --keep-volumes"
}
output, exitCode, err := runGCPManagerAccessCommand(uninstallCmd, *accessConfig, nil, nil)
if (exitCode != nil && *exitCode != 0) || err != nil {
if len(output) == 0 {
fmt.Printf("\n")
}
fmt.Print("\n")

gkePvcDiskPrefix := fmt.Sprintf("gke-%s", *accessConfig.ClusterName)
if err != nil {
fmt.Print(fmt.Sprintf("○ failed to delete persistent disks from storage, please visit https://console.cloud.google.com/compute/disks?project=%s to manually delete the disks starting with the %s prefix: %s", *accessConfig.Project, gkePvcDiskPrefix, err.Error()))
telemetry.Error(ErrorClusterDown(err.Error()))
} else {
fmt.Print(fmt.Sprintf("○ failed to delete persistent disks from storage, please visit https://console.cloud.google.com/compute/disks?project=%s to manually delete the disks starting with the %s prefix", *accessConfig.Project, gkePvcDiskPrefix))
telemetry.Error(ErrorClusterDown(output))
}

fmt.Print("\n\n")
fmt.Print("○ proceeding with best-effort deletion of the cluster ")
}

_, err = gcpClient.DeleteCluster(gkeClusterName)
if err != nil {
fmt.Print("\n\n")
17 changes: 6 additions & 11 deletions docs/clusters/aws/uninstall.md
@@ -26,22 +26,17 @@ aws s3 rb --force s3://<bucket>
aws logs describe-log-groups --log-group-name-prefix=<cluster_name> --query logGroups[*].[logGroupName] --output text | xargs -I {} aws logs delete-log-group --log-group-name {}
```

## Delete Volumes

The volumes used by Cortex's Prometheus and Grafana instances are not deleted by default, as they might contain important
information. If these volumes are not required anymore, you can delete them in the AWS console.

To delete the volumes, navigate to the [EC2 volumes page](https://console.aws.amazon.com/ec2/v2/home?#Volumes)
in the AWS console (be sure to set the appropriate region), select the volumes, click "Actions" and then "Delete Volume"
. Both volumes for Prometheus and Grafana that Cortex created have a name that starts with `kubernetes-dynamic-pvc`,
the `kubernetes.io/cluster/<cluster name>` tag is set to `owned`, and the `kubernetes.io/created-for/pvc/name` tag start
with `prometheus-` and `grafana-` respectively.

## Delete Certificates

If you've configured a custom domain for your APIs, you can remove the SSL Certificate and Hosted Zone for the domain by
following these [instructions](networking/custom-domain.md#cleanup).

## Keep Cortex Volumes

The volumes used by Cortex's Prometheus and Grafana instances are deleted by default on a cluster down operation.
If you want to keep the metrics and dashboards volumes for any reason,
you can pass the `--keep-volumes` flag to the `cortex cluster down` command.
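For example:

```
cortex cluster down --keep-volumes
```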

## Troubleshooting

On rare occasions, `cortex cluster down` may not be able to spin down your Cortex cluster. When this happens, follow
11 changes: 4 additions & 7 deletions docs/clusters/gcp/uninstall.md
@@ -10,11 +10,8 @@ cortex cluster-gcp down
The `cortex cluster-gcp down` command doesn't wait for the cluster to spin down. You can ensure that the cluster has
spun down by checking the GKE console.

## Delete Volumes
## Keep Cortex Volumes

The volumes used by Cortex's Prometheus and Grafana instances are not deleted by default, as they might contain important
information. If these volumes are not required anymore, you can delete them in the GCP console. Navigate to
the [Disks](https://console.cloud.google.com/compute/disks) page (be sure to set the appropriate project), select the
volumes, and click "Delete". The Prometheus and Grafana volumes that Cortex created have a name that starts
with `gke-<cluster name>-`, and the `kubernetes.io/created-for/pvc/name` tag starts with `prometheus-` and `grafana-`
respectively.
The volumes used by Cortex's Prometheus and Grafana instances are deleted by default on a cluster down operation.
If you want to keep the metrics and dashboards volumes for any reason,
you can pass the `--keep-volumes` flag to the `cortex cluster-gcp down` command.
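For example:

```
cortex cluster-gcp down --keep-volumes
```
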
49 changes: 46 additions & 3 deletions manager/uninstall.sh
@@ -18,8 +18,51 @@ set -e

EKSCTL_TIMEOUT=45m

echo
arg1="$1"

eksctl delete cluster --wait --name=$CORTEX_CLUSTER_NAME --region=$CORTEX_REGION --timeout=$EKSCTL_TIMEOUT
function main() {
if [ "$CORTEX_PROVIDER" == "aws" ]; then
uninstall_aws
elif [ "$CORTEX_PROVIDER" == "gcp" ]; then
uninstall_gcp
fi
}

echo -e "\n✓ done spinning down the cluster"
function uninstall_gcp() {
gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS 2> /dev/stdout 1> /dev/null | (grep -v "Activated service account credentials" || true)
gcloud container clusters get-credentials $CORTEX_CLUSTER_NAME --project $CORTEX_GCP_PROJECT --region $CORTEX_GCP_ZONE 2> /dev/stdout 1> /dev/null | (grep -v "Fetching cluster" | grep -v "kubeconfig entry generated" || true)

if [ "$arg1" != "--keep-volumes" ]; then
uninstall_prometheus
uninstall_grafana
fi
}

function uninstall_aws() {
echo

aws eks --region $CORTEX_REGION update-kubeconfig --name $CORTEX_CLUSTER_NAME >/dev/null

if [ "$arg1" != "--keep-volumes" ]; then
uninstall_prometheus
uninstall_grafana
fi

eksctl delete cluster --wait --name=$CORTEX_CLUSTER_NAME --region=$CORTEX_REGION --timeout=$EKSCTL_TIMEOUT
echo -e "\n✓ done spinning down the cluster"
}

function uninstall_prometheus() {
kubectl get configmap cluster-config -o jsonpath='{.data.cluster\.yaml}' > ./cluster.yaml

# delete resources to detach disk
python render_template.py ./cluster.yaml manifests/prometheus-monitoring.yaml.j2 | kubectl delete -f - >/dev/null
kubectl delete pvc --namespace default prometheus-prometheus-db-prometheus-prometheus-0 >/dev/null
}

function uninstall_grafana() {
kubectl delete statefulset --namespace default grafana >/dev/null
kubectl delete pvc --namespace default grafana-storage >/dev/null
Contributor
In a scenario where cluster-down fails, it may not be possible to get a kubeconfig to delete the volumes via kubectl. We should use cloud-specific APIs to find and delete the volumes without having to rely on kubectl (a rough sketch of this follows the diff below).

}

main
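
A minimal sketch of the kind of fallback suggested in the comment above, for AWS only, assuming the manager's `aws` CLI is available and that the volumes carry the `kubernetes.io/cluster/<cluster name>=owned` tag described in the AWS uninstall docs; the function name and wiring are hypothetical and not part of this PR:

```
# hypothetical fallback (not in this PR): delete Cortex-provisioned EBS volumes
# directly through the AWS API when kubectl/kubeconfig is unavailable
function delete_volumes_via_aws_api() {
  # find detached volumes owned by this cluster, using the tags documented in
  # docs/clusters/aws/uninstall.md
  volume_ids=$(aws ec2 describe-volumes \
    --region "$CORTEX_REGION" \
    --filters "Name=tag:kubernetes.io/cluster/${CORTEX_CLUSTER_NAME},Values=owned" \
              "Name=status,Values=available" \
    --query "Volumes[].VolumeId" \
    --output text)

  for volume_id in $volume_ids; do
    aws ec2 delete-volume --region "$CORTEX_REGION" --volume-id "$volume_id"
  done
}
```

A GCP equivalent could list and delete disks whose names start with the `gke-<cluster name>` prefix (the same prefix the CLI prints in its manual-cleanup message), for example via `gcloud compute disks list` and `gcloud compute disks delete`.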