Skip to content

Commit e35af78

Browse files
committed
Add fault domain check in kubectl fdb analyze command
1 parent b1e744e commit e35af78

File tree

4 files changed

+575
-52
lines changed

4 files changed

+575
-52
lines changed

kubectl-fdb/cmd/analyze.go

Lines changed: 160 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -21,24 +21,24 @@
2121
package cmd
2222

2323
import (
24+
"context"
2425
"encoding/json"
2526
"errors"
2627
"fmt"
28+
"sort"
29+
"strconv"
2730
"strings"
2831
"time"
2932

33+
fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/v2/api/v1beta2"
3034
kubeHelper "github.com/FoundationDB/fdb-kubernetes-operator/v2/internal/kubernetes"
3135
"github.com/FoundationDB/fdb-kubernetes-operator/v2/pkg/fdbstatus"
32-
33-
"k8s.io/client-go/rest"
34-
35-
"context"
36-
37-
fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/v2/api/v1beta2"
3836
"github.com/fatih/color"
3937
"github.com/spf13/cobra"
4038
corev1 "k8s.io/api/core/v1"
4139
"k8s.io/cli-runtime/pkg/genericclioptions"
40+
"k8s.io/client-go/rest"
41+
"k8s.io/utils/ptr"
4242
"sigs.k8s.io/controller-runtime/pkg/client"
4343
)
4444

@@ -112,42 +112,34 @@ func newAnalyzeCmd(streams genericclioptions.IOStreams) *cobra.Command {
112112
return err
113113
}
114114

115-
// TODO (jscheuermann): Don't load clusters twice if we check all clusters
116-
var clusters []string
115+
var clusters []*fdbv1beta2.FoundationDBCluster
117116
if allClusters {
118-
var clusterList fdbv1beta2.FoundationDBClusterList
117+
clusterList := &fdbv1beta2.FoundationDBClusterList{}
119118
err := kubeClient.List(
120119
context.Background(),
121-
&clusterList,
120+
clusterList,
122121
client.InNamespace(namespace),
123122
)
124123
if err != nil {
125124
return err
126125
}
127126

128127
for _, cluster := range clusterList.Items {
129-
clusters = append(clusters, cluster.Name)
128+
clusters = append(clusters, ptr.To(cluster))
130129
}
131130
} else {
132-
clusters = args
133-
}
131+
for _, clusterName := range args {
132+
cluster, err := loadCluster(kubeClient, namespace, clusterName)
133+
if err != nil {
134+
return err
135+
}
134136

135-
var errs []error
136-
for _, clusterName := range clusters {
137-
cluster, err := loadCluster(kubeClient, namespace, clusterName)
138-
if err != nil {
139-
errs = append(
140-
errs,
141-
fmt.Errorf(
142-
"could not fetch cluster information for: %s/%s, error: %w",
143-
namespace,
144-
clusterName,
145-
err,
146-
),
147-
)
148-
continue
137+
clusters = append(clusters, cluster)
149138
}
139+
}
150140

141+
var errs []error
142+
for _, cluster := range clusters {
151143
err = analyzeCluster(
152144
cmd,
153145
kubeClient,
@@ -466,6 +458,12 @@ func analyzeCluster(
466458
foundIssues = true
467459
}
468460

461+
// Check if the fault domain distribution is fine and print out how many fault domains are used by process class.
462+
if !faultDomainDistributionIsValid(cmd, cluster) {
463+
foundIssues = true
464+
}
465+
cmd.Println("")
466+
469467
// We could add more auto fixes in the future.
470468
if autoFix {
471469
confirmed := false
@@ -521,6 +519,7 @@ func analyzeCluster(
521519
return fmt.Errorf("found issues for cluster %s. Please check them", cluster.Name)
522520
}
523521

522+
cmd.Println("")
524523
return nil
525524
}
526525

@@ -691,3 +690,137 @@ func analyzeStatusInternal(
691690

692691
return nil
693692
}
693+
694+
// FaultDomainSummary represents the fault domain distribution by process class
695+
type FaultDomainSummary map[fdbv1beta2.ProcessClass]map[fdbv1beta2.FaultDomain]int
696+
697+
// generateFaultDomainSummary analyzes the process groups and returns a summary of fault domain distribution by process class
698+
func generateFaultDomainSummary(cluster *fdbv1beta2.FoundationDBCluster) FaultDomainSummary {
699+
summary := make(FaultDomainSummary)
700+
701+
for _, processGroup := range cluster.Status.ProcessGroups {
702+
// Skip process groups marked for removal
703+
if processGroup.IsMarkedForRemoval() {
704+
continue
705+
}
706+
707+
// Ignore process groups that have never been scheduled.
708+
if processGroup.FaultDomain == "" {
709+
continue
710+
}
711+
712+
// Initialize nested map if needed
713+
if _, exists := summary[processGroup.ProcessClass]; !exists {
714+
summary[processGroup.ProcessClass] = make(map[fdbv1beta2.FaultDomain]int)
715+
}
716+
717+
// Increment the count for this process class and fault domain combination
718+
summary[processGroup.ProcessClass][processGroup.FaultDomain]++
719+
}
720+
721+
return summary
722+
}
723+
724+
// faultDomainDistributionIsValid prints a formatted summary of fault domain distribution by process class and
725+
// returns false if the current unique fault domains are less than the required minimum fault domains.
726+
func faultDomainDistributionIsValid(
727+
cmd *cobra.Command,
728+
cluster *fdbv1beta2.FoundationDBCluster,
729+
) bool {
730+
summary := generateFaultDomainSummary(cluster)
731+
if len(summary) == 0 {
732+
printStatement(cmd, "Could not fetch fault domain information for cluster", warnMessage)
733+
return false
734+
}
735+
736+
cmd.Println("Fault Domain Summary for cluster:")
737+
isValid := true
738+
739+
// Ensure we get a stable result. Iterating over maps can produce different results.
740+
processClassKeys := make([]fdbv1beta2.ProcessClass, 0, len(summary))
741+
for processClass := range summary {
742+
processClassKeys = append(processClassKeys, processClass)
743+
}
744+
sort.SliceStable(processClassKeys, func(i, j int) bool {
745+
return processClassKeys[i] > processClassKeys[j]
746+
})
747+
748+
// Print summary for each process class
749+
for _, processClass := range processClassKeys {
750+
faultDomains := summary[processClass]
751+
if len(faultDomains) == 0 {
752+
continue
753+
}
754+
755+
// Build the fault domain distribution string
756+
totalProcessGroups := 0
757+
758+
for faultDomain, count := range faultDomains {
759+
if faultDomain != "" {
760+
totalProcessGroups += count
761+
}
762+
}
763+
764+
var sb strings.Builder
765+
sb.WriteString(string(processClass))
766+
sb.WriteString(": Total: ")
767+
sb.WriteString(strconv.Itoa(len(faultDomains)))
768+
sb.WriteString(" fault domains, ")
769+
sb.WriteString(strconv.Itoa(totalProcessGroups))
770+
sb.WriteString(" process groups")
771+
772+
if len(faultDomains) != totalProcessGroups {
773+
sb.WriteString(printTopNFaultDomains(faultDomains, 3))
774+
}
775+
776+
minimumFaultDomains := fdbv1beta2.MinimumFaultDomains(
777+
cluster.Spec.DatabaseConfiguration.RedundancyMode,
778+
)
779+
var msgType messageType
780+
if len(faultDomains) > minimumFaultDomains {
781+
msgType = goodMessage
782+
} else if len(faultDomains) == minimumFaultDomains {
783+
msgType = warnMessage
784+
} else {
785+
msgType = errorMessage
786+
isValid = false
787+
}
788+
789+
printStatement(cmd, sb.String(), msgType)
790+
}
791+
792+
return isValid
793+
}
794+
795+
// printTopNFaultDomains prints the top N fault domains.
796+
func printTopNFaultDomains(faultDomains map[fdbv1beta2.FaultDomain]int, n int) string {
797+
// Create slice of key-value pairs for sorting.
798+
type kv struct {
799+
key fdbv1beta2.FaultDomain
800+
value int
801+
}
802+
803+
pairs := make([]kv, 0, len(faultDomains))
804+
for k, v := range faultDomains {
805+
pairs = append(pairs, kv{k, v})
806+
}
807+
808+
// Sort by value descending
809+
sort.SliceStable(pairs, func(i, j int) bool {
810+
return pairs[i].value > pairs[j].value
811+
})
812+
813+
var sb strings.Builder
814+
sb.WriteString(" top ")
815+
sb.WriteString(strconv.Itoa(n))
816+
sb.WriteString(" fault domains:")
817+
// Print top n elements
818+
for i := 0; i < n && i < len(pairs); i++ {
819+
sb.WriteString(" ")
820+
sb.WriteString(string(pairs[i].key))
821+
sb.WriteString(": ")
822+
sb.WriteString(strconv.Itoa(pairs[i].value))
823+
}
824+
825+
return sb.String()
826+
}

0 commit comments

Comments
 (0)