 package cmd
 
 import (
+	"context"
 	"encoding/json"
 	"errors"
 	"fmt"
+	"sort"
+	"strconv"
 	"strings"
 	"time"
 
+	fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/v2/api/v1beta2"
 	kubeHelper "github.com/FoundationDB/fdb-kubernetes-operator/v2/internal/kubernetes"
 	"github.com/FoundationDB/fdb-kubernetes-operator/v2/pkg/fdbstatus"
-
-	"k8s.io/client-go/rest"
-
-	"context"
-
-	fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/v2/api/v1beta2"
 	"github.com/fatih/color"
 	"github.com/spf13/cobra"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/cli-runtime/pkg/genericclioptions"
+	"k8s.io/client-go/rest"
+	"k8s.io/utils/ptr"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
@@ -112,42 +112,34 @@ func newAnalyzeCmd(streams genericclioptions.IOStreams) *cobra.Command {
 				return err
 			}
 
-			// TODO (jscheuermann): Don't load clusters twice if we check all clusters
-			var clusters []string
+			var clusters []*fdbv1beta2.FoundationDBCluster
 			if allClusters {
-				var clusterList fdbv1beta2.FoundationDBClusterList
+				clusterList := &fdbv1beta2.FoundationDBClusterList{}
 				err := kubeClient.List(
 					context.Background(),
-					&clusterList,
+					clusterList,
 					client.InNamespace(namespace),
 				)
 				if err != nil {
 					return err
 				}
 
 				for _, cluster := range clusterList.Items {
-					clusters = append(clusters, cluster.Name)
+					clusters = append(clusters, ptr.To(cluster))
 				}
 			} else {
-				clusters = args
-			}
+				for _, clusterName := range args {
+					cluster, err := loadCluster(kubeClient, namespace, clusterName)
+					if err != nil {
+						return err
+					}
 
-			var errs []error
-			for _, clusterName := range clusters {
-				cluster, err := loadCluster(kubeClient, namespace, clusterName)
-				if err != nil {
-					errs = append(
-						errs,
-						fmt.Errorf(
-							"could not fetch cluster information for: %s/%s, error: %w",
-							namespace,
-							clusterName,
-							err,
-						),
-					)
-					continue
+					clusters = append(clusters, cluster)
 				}
+			}
 
+			var errs []error
+			for _, cluster := range clusters {
 				err = analyzeCluster(
 					cmd,
 					kubeClient,
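A note on the refactored loading path above: ptr.To from k8s.io/utils/ptr returns a pointer to a copy of its argument, so appending ptr.To(cluster) inside the range loop stores a pointer to a distinct FoundationDBCluster value on every iteration instead of aliasing the loop variable. A minimal, self-contained sketch of that behavior (the names and values are made up for illustration, not taken from this PR):

package main

import (
	"fmt"

	"k8s.io/utils/ptr"
)

func main() {
	names := []string{"cluster-a", "cluster-b"} // hypothetical cluster names
	var pointers []*string
	for _, name := range names {
		// ptr.To copies "name", so each stored pointer refers to its own value.
		pointers = append(pointers, ptr.To(name))
	}
	for _, p := range pointers {
		fmt.Println(*p) // prints "cluster-a", then "cluster-b"
	}
}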
@@ -466,6 +458,12 @@ func analyzeCluster(
 		foundIssues = true
 	}
 
+	// Check if the fault domain distribution is fine and print out how many fault domains are used by process class.
+	if !faultDomainDistributionIsValid(cmd, cluster) {
+		foundIssues = true
+	}
+	cmd.Println("")
+
 	// We could add more auto fixes in the future.
 	if autoFix {
 		confirmed := false
@@ -521,6 +519,7 @@ func analyzeCluster(
 		return fmt.Errorf("found issues for cluster %s. Please check them", cluster.Name)
 	}
 
+	cmd.Println("")
 	return nil
 }
 
@@ -691,3 +690,137 @@ func analyzeStatusInternal(
 
 	return nil
 }
+
+// FaultDomainSummary represents the fault domain distribution by process class
+type FaultDomainSummary map[fdbv1beta2.ProcessClass]map[fdbv1beta2.FaultDomain]int
+
+// generateFaultDomainSummary analyzes the process groups and returns a summary of fault domain distribution by process class
+func generateFaultDomainSummary(cluster *fdbv1beta2.FoundationDBCluster) FaultDomainSummary {
+	summary := make(FaultDomainSummary)
+
+	for _, processGroup := range cluster.Status.ProcessGroups {
+		// Skip process groups marked for removal
+		if processGroup.IsMarkedForRemoval() {
+			continue
+		}
+
+		// Ignore process groups that have never been scheduled.
+		if processGroup.FaultDomain == "" {
+			continue
+		}
+
+		// Initialize nested map if needed
+		if _, exists := summary[processGroup.ProcessClass]; !exists {
+			summary[processGroup.ProcessClass] = make(map[fdbv1beta2.FaultDomain]int)
+		}
+
+		// Increment the count for this process class and fault domain combination
+		summary[processGroup.ProcessClass][processGroup.FaultDomain]++
+	}
+
+	return summary
+}
+
+// faultDomainDistributionIsValid prints a formatted summary of the fault domain distribution by process class and
+// returns false if, for any process class, the number of unique fault domains is below the required minimum.
+func faultDomainDistributionIsValid(
+	cmd *cobra.Command,
+	cluster *fdbv1beta2.FoundationDBCluster,
+) bool {
+	summary := generateFaultDomainSummary(cluster)
+	if len(summary) == 0 {
+		printStatement(cmd, "Could not fetch fault domain information for cluster", warnMessage)
+		return false
+	}
+
+	cmd.Println("Fault Domain Summary for cluster:")
+	isValid := true
+
+	// Ensure we get a stable result. Iterating over maps can produce a different order on every run.
+	processClassKeys := make([]fdbv1beta2.ProcessClass, 0, len(summary))
+	for processClass := range summary {
+		processClassKeys = append(processClassKeys, processClass)
+	}
+	sort.SliceStable(processClassKeys, func(i, j int) bool {
+		return processClassKeys[i] > processClassKeys[j]
+	})
+
+	// Print the summary for each process class
+	for _, processClass := range processClassKeys {
+		faultDomains := summary[processClass]
+		if len(faultDomains) == 0 {
+			continue
+		}
+
+		// Count the process groups that report a fault domain
+		totalProcessGroups := 0
+
+		for faultDomain, count := range faultDomains {
+			if faultDomain != "" {
+				totalProcessGroups += count
+			}
+		}
+
+		var sb strings.Builder
+		sb.WriteString(string(processClass))
+		sb.WriteString(": Total: ")
+		sb.WriteString(strconv.Itoa(len(faultDomains)))
+		sb.WriteString(" fault domains, ")
+		sb.WriteString(strconv.Itoa(totalProcessGroups))
+		sb.WriteString(" process groups")
+
+		if len(faultDomains) != totalProcessGroups {
+			sb.WriteString(printTopNFaultDomains(faultDomains, 3))
+		}
+
+		minimumFaultDomains := fdbv1beta2.MinimumFaultDomains(
+			cluster.Spec.DatabaseConfiguration.RedundancyMode,
+		)
+		var msgType messageType
+		if len(faultDomains) > minimumFaultDomains {
+			msgType = goodMessage
+		} else if len(faultDomains) == minimumFaultDomains {
+			msgType = warnMessage
+		} else {
+			msgType = errorMessage
+			isValid = false
+		}
+
+		printStatement(cmd, sb.String(), msgType)
+	}
+
+	return isValid
+}
+
+// printTopNFaultDomains returns a string listing the top n fault domains by process group count.
+func printTopNFaultDomains(faultDomains map[fdbv1beta2.FaultDomain]int, n int) string {
+	// Create a slice of key-value pairs for sorting.
+	type kv struct {
+		key   fdbv1beta2.FaultDomain
+		value int
+	}
+
+	pairs := make([]kv, 0, len(faultDomains))
+	for k, v := range faultDomains {
+		pairs = append(pairs, kv{k, v})
+	}
+
+	// Sort by value descending
+	sort.SliceStable(pairs, func(i, j int) bool {
+		return pairs[i].value > pairs[j].value
+	})
+
+	var sb strings.Builder
+	sb.WriteString(" top ")
+	sb.WriteString(strconv.Itoa(n))
+	sb.WriteString(" fault domains:")
+	// Append the top n elements.
+	for i := 0; i < n && i < len(pairs); i++ {
+		sb.WriteString(" ")
+		sb.WriteString(string(pairs[i].key))
+		sb.WriteString(": ")
+		sb.WriteString(strconv.Itoa(pairs[i].value))
+	}
+
+	return sb.String()
+}
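As a quick illustration of the new helper, here is a hypothetical snippet (not part of the change) showing how generateFaultDomainSummary behaves for a hand-built cluster status. The field and constant names come from the fdbv1beta2 API already imported in this file; the fault domain names "zone-a"/"zone-b" are made up.

// Hypothetical usage sketch inside the cmd package.
func sketchFaultDomainSummary() {
	cluster := &fdbv1beta2.FoundationDBCluster{
		Status: fdbv1beta2.FoundationDBClusterStatus{
			ProcessGroups: []*fdbv1beta2.ProcessGroupStatus{
				{ProcessClass: fdbv1beta2.ProcessClassStorage, FaultDomain: "zone-a"},
				{ProcessClass: fdbv1beta2.ProcessClassStorage, FaultDomain: "zone-a"},
				{ProcessClass: fdbv1beta2.ProcessClassStorage, FaultDomain: "zone-b"},
				// An empty fault domain means the process group was never scheduled, so it is ignored.
				{ProcessClass: fdbv1beta2.ProcessClassLog, FaultDomain: ""},
			},
		},
	}

	summary := generateFaultDomainSummary(cluster)
	fmt.Println(summary[fdbv1beta2.ProcessClassStorage]["zone-a"]) // 2
	fmt.Println(summary[fdbv1beta2.ProcessClassStorage]["zone-b"]) // 1
	fmt.Println(len(summary[fdbv1beta2.ProcessClassLog]))          // 0, the unscheduled log process group is skipped
}

faultDomainDistributionIsValid then compares the number of unique fault domains per process class against fdbv1beta2.MinimumFaultDomains for the configured redundancy mode: more than the minimum is reported as good, exactly the minimum as a warning, and fewer as an error.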