diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index b1e02340ec..5a345de342 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -416,6 +416,7 @@ func (p *csiProvisioner) Provision(options controller.VolumeOptions) (*v1.Persis p.client, p.csiAPIClient, driverState.driverName, + options.PVC.Name, options.AllowedTopologies, options.SelectedNode) if err != nil { diff --git a/pkg/controller/topology.go b/pkg/controller/topology.go index d7e3fac778..6a763970e4 100644 --- a/pkg/controller/topology.go +++ b/pkg/controller/topology.go @@ -20,6 +20,7 @@ import ( "fmt" "github.com/container-storage-interface/spec/lib/go/csi/v0" "github.com/golang/glog" + "hash/fnv" "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" @@ -27,6 +28,7 @@ import ( csiclientset "k8s.io/csi-api/pkg/client/clientset/versioned" "math/rand" "sort" + "strconv" "strings" ) @@ -68,6 +70,7 @@ func GenerateAccessibilityRequirements( kubeClient kubernetes.Interface, csiAPIClient csiclientset.Interface, driverName string, + pvcName string, allowedTopologies []v1.TopologySelectorTerm, selectedNode *v1.Node) (*csi.TopologyRequirement, error) { requirement := &csi.TopologyRequirement{} @@ -96,7 +99,15 @@ func GenerateAccessibilityRequirements( requirement.Requisite = toCSITopology(requisiteTerms) /* Preferred */ - if selectedNode != nil { + var preferredTerms []topologyTerm + if selectedNode == nil { + // no node selected therefore ensure even spreading of StatefulSet volumes by sorting + // requisiteTerms and shifting the sorted terms based on hash of pvcName and replica index suffix + hash, index := getPVCNameHashAndIndexOffset(pvcName) + i := (hash + index) % uint32(len(requisiteTerms)) + preferredTerms = sortAndShift(requisiteTerms, nil, i) + } else { + // selectedNode is set so use topology from that node to populate preferredTerms // TODO (verult) reuse selected node info from aggregateTopologies // TODO (verult) retry nodeInfo, err := csiAPIClient.CsiV1alpha1().CSINodeInfos().Get(selectedNode.Name, metav1.GetOptions{}) @@ -110,7 +121,7 @@ func GenerateAccessibilityRequirements( return nil, fmt.Errorf("topology labels from selected node %v does not match topology keys from CSINodeInfo %v", selectedNode.Labels, topologyKeys) } - preferredTerms := sortAndShift(requisiteTerms, selectedTopology) + preferredTerms = sortAndShift(requisiteTerms, selectedTopology, 0) if preferredTerms == nil { // Topology from selected node is not in requisite. This case should never be hit: // - If AllowedTopologies is specified, the scheduler should choose a node satisfying the @@ -119,10 +130,8 @@ func GenerateAccessibilityRequirements( // selected node. return nil, fmt.Errorf("topology %v from selected node %q is not in requisite", selectedTopology, selectedNode.Name) } - - requirement.Preferred = toCSITopology(preferredTerms) } - + requirement.Preferred = toCSITopology(preferredTerms) return requirement, nil } @@ -267,17 +276,21 @@ func deduplicate(terms []topologyTerm) []topologyTerm { } // Sort the given terms in place, -// then return a new list of terms equivalent to the sorted terms, but shifted so that the primary -// term is the first in the list. -func sortAndShift(terms []topologyTerm, primary topologyTerm) []topologyTerm { +// then return a new list of terms equivalent to the sorted terms, but shifted so that +// either the primary term (if specified) or term at shiftIndex is the first in the list. +func sortAndShift(terms []topologyTerm, primary topologyTerm, shiftIndex uint32) []topologyTerm { var preferredTerms []topologyTerm sort.Slice(terms, func(i, j int) bool { return terms[i].less(terms[j]) }) - for i, t := range terms { - if t.equal(primary) { - preferredTerms = append(terms[i:], terms[:i]...) - break + if primary == nil { + preferredTerms = append(terms[shiftIndex:], terms[:shiftIndex]...) + } else { + for i, t := range terms { + if t.equal(primary) { + preferredTerms = append(terms[i:], terms[:i]...) + break + } } } return preferredTerms @@ -367,3 +380,53 @@ func toCSITopology(terms []topologyTerm) []*csi.Topology { } return out } + +// identical to logic in getPVCNameHashAndIndexOffset in pkg/volume/util/util.go in-tree +// [https://github.com/kubernetes/kubernetes/blob/master/pkg/volume/util/util.go] +func getPVCNameHashAndIndexOffset(pvcName string) (hash uint32, index uint32) { + if pvcName == "" { + // We should always be called with a name; this shouldn't happen + hash = rand.Uint32() + } else { + hashString := pvcName + + // Heuristic to make sure that volumes in a StatefulSet are spread across zones + // StatefulSet PVCs are (currently) named ClaimName-StatefulSetName-Id, + // where Id is an integer index. + // Note though that if a StatefulSet pod has multiple claims, we need them to be + // in the same zone, because otherwise the pod will be unable to mount both volumes, + // and will be unschedulable. So we hash _only_ the "StatefulSetName" portion when + // it looks like `ClaimName-StatefulSetName-Id`. + // We continue to round-robin volume names that look like `Name-Id` also; this is a useful + // feature for users that are creating statefulset-like functionality without using statefulsets. + lastDash := strings.LastIndexByte(pvcName, '-') + if lastDash != -1 { + statefulsetIDString := pvcName[lastDash+1:] + statefulsetID, err := strconv.ParseUint(statefulsetIDString, 10, 32) + if err == nil { + // Offset by the statefulsetID, so we round-robin across zones + index = uint32(statefulsetID) + // We still hash the volume name, but only the prefix + hashString = pvcName[:lastDash] + + // In the special case where it looks like `ClaimName-StatefulSetName-Id`, + // hash only the StatefulSetName, so that different claims on the same StatefulSet + // member end up in the same zone. + // Note that StatefulSetName (and ClaimName) might themselves both have dashes. + // We actually just take the portion after the final - of ClaimName-StatefulSetName. + // For our purposes it doesn't much matter (just suboptimal spreading). + lastDash := strings.LastIndexByte(hashString, '-') + if lastDash != -1 { + hashString = hashString[lastDash+1:] + } + } + } + + // We hash the (base) volume name, so we don't bias towards the first N zones + h := fnv.New32() + h.Write([]byte(hashString)) + hash = h.Sum32() + } + + return hash, index +} diff --git a/pkg/controller/topology_test.go b/pkg/controller/topology_test.go index 30046d5279..6f64112a54 100644 --- a/pkg/controller/topology_test.go +++ b/pkg/controller/topology_test.go @@ -146,6 +146,279 @@ func TestGenerateVolumeNodeAffinity(t *testing.T) { } } +func TestStatefulSetSpreading(t *testing.T) { + nodeLabels := []map[string]string{ + {"com.example.csi/zone": "zone1", "com.example.csi/rack": "rackA"}, + {"com.example.csi/zone": "zone2", "com.example.csi/rack": "rackB"}, + {"com.example.csi/zone": "zone3", "com.example.csi/rack": "rackC"}, + {"com.example.csi/zone": "zone4", "com.example.csi/rack": "rackD"}, + } + var topologyKeys []map[string][]string + keys := map[string][]string{testDriverName: {"com.example.csi/zone", "com.example.csi/rack"}} + for i := 0; i < len(nodeLabels); i++ { + topologyKeys = append(topologyKeys, keys) + } + // Ordering of segments in preferred array is sensitive to statefulset name portion of pvcName. + // In the tests below, name of the statefulset: testset is the portion whose hash determines ordering. + // If statefulset name is changed, make sure expectedPreferred is kept in sync. + // pvc prefix in pvcName does not have any effect on segment ordering + testcases := map[string]struct { + pvcName string + allowedTopologies []v1.TopologySelectorTerm + expectedPreferred []*csi.Topology + }{ + "select index 0 among nodes for pvc with statefulset name:testset and id:1; ignore claimname:testpvcA": { + pvcName: "testpvcA-testset-1", + expectedPreferred: []*csi.Topology{ + { + Segments: map[string]string{ + "com.example.csi/rack": "rackA", + "com.example.csi/zone": "zone1", + }, + }, + { + Segments: map[string]string{ + "com.example.csi/rack": "rackB", + "com.example.csi/zone": "zone2", + }, + }, + { + Segments: map[string]string{ + "com.example.csi/rack": "rackC", + "com.example.csi/zone": "zone3", + }, + }, + { + Segments: map[string]string{ + "com.example.csi/rack": "rackD", + "com.example.csi/zone": "zone4", + }, + }, + }, + }, + "select index 0 among nodes for pvc with statefulset name:testset and id:1; ignore claimname:testpvcB": { + pvcName: "testpvcB-testset-1", + expectedPreferred: []*csi.Topology{ + { + Segments: map[string]string{ + "com.example.csi/rack": "rackA", + "com.example.csi/zone": "zone1", + }, + }, + { + Segments: map[string]string{ + "com.example.csi/rack": "rackB", + "com.example.csi/zone": "zone2", + }, + }, + { + Segments: map[string]string{ + "com.example.csi/rack": "rackC", + "com.example.csi/zone": "zone3", + }, + }, + { + Segments: map[string]string{ + "com.example.csi/rack": "rackD", + "com.example.csi/zone": "zone4", + }, + }, + }, + }, + "select index 0 among allowedTopologies with single term/multiple requirements for pvc with statefulset name:testset and id:1; ignore claimname:testpvcC": { + pvcName: "testpvcC-testset-1", + allowedTopologies: []v1.TopologySelectorTerm{ + { + MatchLabelExpressions: []v1.TopologySelectorLabelRequirement{ + { + Key: "com.example.csi/zone", + Values: []string{"zone1"}, + }, + { + Key: "com.example.csi/rack", + Values: []string{"rackA"}, + }, + }, + }, + }, + expectedPreferred: []*csi.Topology{ + { + Segments: map[string]string{ + "com.example.csi/rack": "rackA", + "com.example.csi/zone": "zone1", + }, + }, + }, + }, + "select index 1 among nodes for pvc with statefulset name:testset and id:2": { + pvcName: "testset-2", + expectedPreferred: []*csi.Topology{ + { + Segments: map[string]string{ + "com.example.csi/rack": "rackB", + "com.example.csi/zone": "zone2", + }, + }, + { + Segments: map[string]string{ + "com.example.csi/rack": "rackC", + "com.example.csi/zone": "zone3", + }, + }, + { + Segments: map[string]string{ + "com.example.csi/rack": "rackD", + "com.example.csi/zone": "zone4", + }, + }, + { + Segments: map[string]string{ + "com.example.csi/rack": "rackA", + "com.example.csi/zone": "zone1", + }, + }, + }, + }, + "select index 1 among allowedTopologies with multiple terms/multiple requirments for pvc with statefulset name:testset and id:2; ignore claimname:testpvcB": { + pvcName: "testpvcB-testset-2", + allowedTopologies: []v1.TopologySelectorTerm{ + { + MatchLabelExpressions: []v1.TopologySelectorLabelRequirement{ + { + Key: "com.example.csi/zone", + Values: []string{"zone2"}, + }, + { + Key: "com.example.csi/rack", + Values: []string{"rackB"}, + }, + }, + }, + { + MatchLabelExpressions: []v1.TopologySelectorLabelRequirement{ + { + Key: "com.example.csi/zone", + Values: []string{"zone1"}, + }, + { + Key: "com.example.csi/rack", + Values: []string{"rackA"}, + }, + }, + }, + }, + expectedPreferred: []*csi.Topology{ + { + Segments: map[string]string{ + "com.example.csi/rack": "rackB", + "com.example.csi/zone": "zone2", + }, + }, + { + Segments: map[string]string{ + "com.example.csi/rack": "rackA", + "com.example.csi/zone": "zone1", + }, + }, + }, + }, + "select index 2 among nodes with statefulset name:testset and id:3; ignore claimname:testpvc": { + pvcName: "testpvc-testset-3", + expectedPreferred: []*csi.Topology{ + { + Segments: map[string]string{ + "com.example.csi/rack": "rackC", + "com.example.csi/zone": "zone3", + }, + }, + { + Segments: map[string]string{ + "com.example.csi/rack": "rackD", + "com.example.csi/zone": "zone4", + }, + }, + { + Segments: map[string]string{ + "com.example.csi/rack": "rackA", + "com.example.csi/zone": "zone1", + }, + }, + { + Segments: map[string]string{ + "com.example.csi/rack": "rackB", + "com.example.csi/zone": "zone2", + }, + }, + }, + }, + "select index 3 among nodes with statefulset name:testset and id:4; ignore claimname:testpvc": { + pvcName: "testpvc-testset-4", + expectedPreferred: []*csi.Topology{ + { + Segments: map[string]string{ + "com.example.csi/rack": "rackD", + "com.example.csi/zone": "zone4", + }, + }, + { + Segments: map[string]string{ + "com.example.csi/rack": "rackA", + "com.example.csi/zone": "zone1", + }, + }, + { + Segments: map[string]string{ + "com.example.csi/rack": "rackB", + "com.example.csi/zone": "zone2", + }, + }, + { + Segments: map[string]string{ + "com.example.csi/rack": "rackC", + "com.example.csi/zone": "zone3", + }, + }, + }, + }, + } + + nodes := buildNodes(nodeLabels) + nodeInfos := buildNodeInfos(topologyKeys) + + kubeClient := fakeclientset.NewSimpleClientset(nodes) + csiClient := fakecsiclientset.NewSimpleClientset(nodeInfos) + + for name, tc := range testcases { + t.Logf("test: %s", name) + + requirements, err := GenerateAccessibilityRequirements( + kubeClient, + csiClient, + testDriverName, + tc.pvcName, + tc.allowedTopologies, + nil, + ) + + if err != nil { + t.Errorf("unexpected error found: %v", err) + continue + } + + if requirements == nil { + t.Errorf("expected preferred to be %v but requirements is nil", tc.expectedPreferred) + continue + } + if requirements.Preferred == nil { + t.Errorf("expected preferred to be %v but requirements.Preferred is nil", tc.expectedPreferred) + continue + } + if !helper.Semantic.DeepEqual(requirements.Preferred, tc.expectedPreferred) { + t.Errorf("expected preferred requisite %v; got: %v", tc.expectedPreferred, requirements.Preferred) + } + } +} + func TestAllowedTopologies(t *testing.T) { // TODO (verult) more AllowedTopologies unit tests testcases := map[string]struct { @@ -510,6 +783,7 @@ func TestAllowedTopologies(t *testing.T) { nil, /* kubeClient */ nil, /* csiAPIClient */ "test-driver", /* driverName */ + "testpvc", tc.allowedTopologies, nil /* selectedNode */) @@ -685,6 +959,7 @@ func TestTopologyAggregation(t *testing.T) { kubeClient, csiClient, testDriverName, + "testpvc", nil, /* allowedTopologies */ selectedNode, ) @@ -853,6 +1128,7 @@ func TestPreferredTopologies(t *testing.T) { kubeClient, csiClient, testDriverName, + "testpvc", tc.allowedTopologies, selectedNode, )