Skip to content

Commit

Permalink
Improved management of status for IndexerCluster and SearchHeadCluster
Browse files Browse the repository at this point in the history
Added doc example on using a Horizontal Pod Autoscaler
  • Loading branch information
mikedickey committed Mar 20, 2020
1 parent 9a760bb commit 78b6874
Show file tree
Hide file tree
Showing 7 changed files with 187 additions and 48 deletions.
26 changes: 26 additions & 0 deletions docs/Examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,32 @@ $ kubectl scale idc example --replicas=5
indexercluster.enterprise.splunk.com/example scaled
```

You can also create [Horizontal Pod Autoscalers](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/)
to manage scaling for you. For example:

```yaml
cat <<EOF | kubectl apply -f -
apiVersion: autoscaling/v1
kind: HorizontalPodAutoscaler
metadata:
name: idc-example
spec:
scaleTargetRef:
apiVersion: enterprise.splunk.com/v1alpha2
kind: IndexerCluster
name: example
minReplicas: 5
maxReplicas: 10
targetCPUUtilizationPercentage: 50
EOF
```

```
$ kubectl get hpa
NAME REFERENCE TARGETS MINPODS MAXPODS REPLICAS AGE
idc-example IndexerCluster/example 16%/50% 5 10 5 15m
```

To create a standalone search head that uses your indexer cluster, all you
have to do is add an `indexerClusterRef` parameter:

Expand Down
20 changes: 17 additions & 3 deletions pkg/splunk/reconcile/indexercluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,14 @@ func ApplyIndexerCluster(client ControllerClient, cr *enterprisev1.IndexerCluste
cr.Status.ClusterMasterPhase = enterprisev1.PhaseError
cr.Status.Replicas = cr.Spec.Replicas
cr.Status.Selector = fmt.Sprintf("app.kubernetes.io/instance=splunk-%s-indexer", cr.GetIdentifier())
if cr.Status.Peers == nil {
cr.Status.Peers = []enterprisev1.IndexerClusterMemberStatus{}
}
defer func() {
client.Status().Update(context.TODO(), cr)
err = client.Status().Update(context.TODO(), cr)
if err != nil {
scopedLog.Error(err, "Status update failed")
}
}()

// check if deletion has been requested
Expand Down Expand Up @@ -227,7 +233,6 @@ func (mgr *IndexerClusterPodManager) getClusterMasterClient() *splclient.SplunkC
// updateStatus for IndexerClusterPodManager uses the REST API to update the status for an IndexerCluster custom resource
func (mgr *IndexerClusterPodManager) updateStatus(statefulSet *appsv1.StatefulSet) error {
mgr.cr.Status.ReadyReplicas = statefulSet.Status.ReadyReplicas
mgr.cr.Status.Peers = []enterprisev1.IndexerClusterMemberStatus{}

if mgr.cr.Status.ClusterMasterPhase != enterprisev1.PhaseReady {
mgr.cr.Status.Initialized = false
Expand Down Expand Up @@ -266,7 +271,16 @@ func (mgr *IndexerClusterPodManager) updateStatus(statefulSet *appsv1.StatefulSe
} else {
mgr.log.Info("Peer is not known by cluster master", "peerName", peerName)
}
mgr.cr.Status.Peers = append(mgr.cr.Status.Peers, peerStatus)
if n < int32(len(mgr.cr.Status.Peers)) {
mgr.cr.Status.Peers[n] = peerStatus
} else {
mgr.cr.Status.Peers = append(mgr.cr.Status.Peers, peerStatus)
}
}

// truncate any extra peers that we didn't check (leftover from scale down)
if statefulSet.Status.Replicas < int32(len(mgr.cr.Status.Peers)) {
mgr.cr.Status.Peers = mgr.cr.Status.Peers[:statefulSet.Status.Replicas]
}

return nil
Expand Down
44 changes: 34 additions & 10 deletions pkg/splunk/reconcile/indexercluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,20 @@ func TestIndexerClusterPodManager(t *testing.T) {

// test 1 ready pod
mockHandlers := []spltest.MockHTTPHandler{
{"GET", "https://splunk-stack1-cluster-master-service.test.svc.cluster.local:8089/services/cluster/master/info?count=0&output_mode=json", 200, nil,
`{"links":{},"origin":"https://localhost:8089/services/cluster/master/info","updated":"2020-03-18T01:04:53+00:00","generator":{"build":"a7f645ddaf91","version":"8.0.2"},"entry":[{"name":"master","id":"https://localhost:8089/services/cluster/master/info/master","updated":"1970-01-01T00:00:00+00:00","links":{"alternate":"/services/cluster/master/info/master","list":"/services/cluster/master/info/master"},"author":"system","acl":{"app":"","can_list":true,"can_write":true,"modifiable":false,"owner":"system","perms":{"read":["admin","splunk-system-role"],"write":["admin","splunk-system-role"]},"removable":false,"sharing":"system"},"content":{"active_bundle":{"bundle_path":"/opt/splunk/var/run/splunk/cluster/remote-bundle/506c58d5aeda1dd6017889e3186e7337-1583870198.bundle","checksum":"14310A4AABD23E85BBD4559C4A3B59F8","timestamp":1583870198},"apply_bundle_status":{"invalid_bundle":{"bundle_path":"","bundle_validation_errors_on_master":[],"checksum":"","timestamp":0},"reload_bundle_issued":false,"status":"None"},"backup_and_restore_primaries":false,"controlled_rolling_restart_flag":false,"eai:acl":null,"indexing_ready_flag":true,"initialized_flag":true,"label":"splunk-stack1-cluster-master-0","last_check_restart_bundle_result":false,"last_dry_run_bundle":{"bundle_path":"","checksum":"","timestamp":0},"last_validated_bundle":{"bundle_path":"/opt/splunk/var/run/splunk/cluster/remote-bundle/0af7c0e95f313f7be3b0cb1d878df9a1-1583948640.bundle","checksum":"14310A4AABD23E85BBD4559C4A3B59F8","is_valid_bundle":true,"timestamp":1583948640},"latest_bundle":{"bundle_path":"/opt/splunk/var/run/splunk/cluster/remote-bundle/506c58d5aeda1dd6017889e3186e7337-1583870198.bundle","checksum":"14310A4AABD23E85BBD4559C4A3B59F8","timestamp":1583870198},"maintenance_mode":false,"multisite":false,"previous_active_bundle":{"bundle_path":"","checksum":"","timestamp":0},"primaries_backup_status":"No on-going (or) completed primaries backup yet. 
Check back again in few minutes if you expect a backup.","quiet_period_flag":false,"rolling_restart_flag":false,"rolling_restart_or_upgrade":false,"service_ready_flag":true,"start_time":1583948636,"summary_replication":"false"}}],"paging":{"total":1,"perPage":30,"offset":0},"messages":[]}`},
{"GET", "https://splunk-stack1-cluster-master-service.test.svc.cluster.local:8089/services/cluster/master/peers?count=0&output_mode=json", 200, nil,
`{"links":{"create":"/services/cluster/master/peers/_new"},"origin":"https://localhost:8089/services/cluster/master/peers","updated":"2020-03-18T01:08:53+00:00","generator":{"build":"a7f645ddaf91","version":"8.0.2"},"entry":[{"name":"D39B1729-E2C5-4273-B9B2-534DA7C2F866","id":"https://localhost:8089/services/cluster/master/peers/D39B1729-E2C5-4273-B9B2-534DA7C2F866","updated":"1970-01-01T00:00:00+00:00","links":{"alternate":"/services/cluster/master/peers/D39B1729-E2C5-4273-B9B2-534DA7C2F866","list":"/services/cluster/master/peers/D39B1729-E2C5-4273-B9B2-534DA7C2F866","edit":"/services/cluster/master/peers/D39B1729-E2C5-4273-B9B2-534DA7C2F866"},"author":"system","acl":{"app":"","can_list":true,"can_write":true,"modifiable":false,"owner":"system","perms":{"read":["admin","splunk-system-role"],"write":["admin","splunk-system-role"]},"removable":false,"sharing":"system"},"content":{"active_bundle_id":"14310A4AABD23E85BBD4559C4A3B59F8","apply_bundle_status":{"invalid_bundle":{"bundle_validation_errors":[],"invalid_bundle_id":""},"reasons_for_restart":[],"restart_required_for_apply_bundle":false,"status":"None"},"base_generation_id":26,"bucket_count":73,"bucket_count_by_index":{"_audit":24,"_internal":45,"_telemetry":4},"buckets_rf_by_origin_site":{"default":73},"buckets_sf_by_origin_site":{"default":73},"delayed_buckets_to_discard":[],"eai:acl":null,"fixup_set":[],"heartbeat_started":true,"host_port_pair":"10.36.0.6:8089","indexing_disk_space":210707374080,"is_searchable":true,"is_valid_bundle":true,"label":"splunk-stack1-indexer-0","last_dry_run_bundle":"","last_heartbeat":1584493732,"last_validated_bundle":"14310A4AABD23E85BBD4559C4A3B59F8","latest_bundle_id":"14310A4AABD23E85BBD4559C4A3B59F8","peer_registered_summaries":true,"pending_builds":[],"pending_job_count":0,"primary_count":73,"primary_count_remote":0,"register_search_address":"10.36.0.6:8089","replication_count":0,"replication_port":9887,"replication_use_ssl":false,"restart_required_for_applying_dry_run_bund
le":false,"search_state_counter":{"PendingSearchable":0,"Searchable":73,"SearchablePendingMask":0,"Unsearchable":0},"site":"default","splunk_version":"8.0.2","status":"Up","status_counter":{"Complete":69,"NonStreamingTarget":0,"StreamingSource":4,"StreamingTarget":0},"summary_replication_count":0}}],"paging":{"total":1,"perPage":30,"offset":0},"messages":[]}`},
{
Method: "GET",
URL: "https://splunk-stack1-cluster-master-service.test.svc.cluster.local:8089/services/cluster/master/info?count=0&output_mode=json",
Status: 200,
Err: nil,
Body: `{"links":{},"origin":"https://localhost:8089/services/cluster/master/info","updated":"2020-03-18T01:04:53+00:00","generator":{"build":"a7f645ddaf91","version":"8.0.2"},"entry":[{"name":"master","id":"https://localhost:8089/services/cluster/master/info/master","updated":"1970-01-01T00:00:00+00:00","links":{"alternate":"/services/cluster/master/info/master","list":"/services/cluster/master/info/master"},"author":"system","acl":{"app":"","can_list":true,"can_write":true,"modifiable":false,"owner":"system","perms":{"read":["admin","splunk-system-role"],"write":["admin","splunk-system-role"]},"removable":false,"sharing":"system"},"content":{"active_bundle":{"bundle_path":"/opt/splunk/var/run/splunk/cluster/remote-bundle/506c58d5aeda1dd6017889e3186e7337-1583870198.bundle","checksum":"14310A4AABD23E85BBD4559C4A3B59F8","timestamp":1583870198},"apply_bundle_status":{"invalid_bundle":{"bundle_path":"","bundle_validation_errors_on_master":[],"checksum":"","timestamp":0},"reload_bundle_issued":false,"status":"None"},"backup_and_restore_primaries":false,"controlled_rolling_restart_flag":false,"eai:acl":null,"indexing_ready_flag":true,"initialized_flag":true,"label":"splunk-stack1-cluster-master-0","last_check_restart_bundle_result":false,"last_dry_run_bundle":{"bundle_path":"","checksum":"","timestamp":0},"last_validated_bundle":{"bundle_path":"/opt/splunk/var/run/splunk/cluster/remote-bundle/0af7c0e95f313f7be3b0cb1d878df9a1-1583948640.bundle","checksum":"14310A4AABD23E85BBD4559C4A3B59F8","is_valid_bundle":true,"timestamp":1583948640},"latest_bundle":{"bundle_path":"/opt/splunk/var/run/splunk/cluster/remote-bundle/506c58d5aeda1dd6017889e3186e7337-1583870198.bundle","checksum":"14310A4AABD23E85BBD4559C4A3B59F8","timestamp":1583870198},"maintenance_mode":false,"multisite":false,"previous_active_bundle":{"bundle_path":"","checksum":"","timestamp":0},"primaries_backup_status":"No on-going (or) completed primaries backup yet. 
Check back again in few minutes if you expect a backup.","quiet_period_flag":false,"rolling_restart_flag":false,"rolling_restart_or_upgrade":false,"service_ready_flag":true,"start_time":1583948636,"summary_replication":"false"}}],"paging":{"total":1,"perPage":30,"offset":0},"messages":[]}`,
},
{
Method: "GET",
URL: "https://splunk-stack1-cluster-master-service.test.svc.cluster.local:8089/services/cluster/master/peers?count=0&output_mode=json",
Status: 200,
Err: nil,
Body: `{"links":{"create":"/services/cluster/master/peers/_new"},"origin":"https://localhost:8089/services/cluster/master/peers","updated":"2020-03-18T01:08:53+00:00","generator":{"build":"a7f645ddaf91","version":"8.0.2"},"entry":[{"name":"D39B1729-E2C5-4273-B9B2-534DA7C2F866","id":"https://localhost:8089/services/cluster/master/peers/D39B1729-E2C5-4273-B9B2-534DA7C2F866","updated":"1970-01-01T00:00:00+00:00","links":{"alternate":"/services/cluster/master/peers/D39B1729-E2C5-4273-B9B2-534DA7C2F866","list":"/services/cluster/master/peers/D39B1729-E2C5-4273-B9B2-534DA7C2F866","edit":"/services/cluster/master/peers/D39B1729-E2C5-4273-B9B2-534DA7C2F866"},"author":"system","acl":{"app":"","can_list":true,"can_write":true,"modifiable":false,"owner":"system","perms":{"read":["admin","splunk-system-role"],"write":["admin","splunk-system-role"]},"removable":false,"sharing":"system"},"content":{"active_bundle_id":"14310A4AABD23E85BBD4559C4A3B59F8","apply_bundle_status":{"invalid_bundle":{"bundle_validation_errors":[],"invalid_bundle_id":""},"reasons_for_restart":[],"restart_required_for_apply_bundle":false,"status":"None"},"base_generation_id":26,"bucket_count":73,"bucket_count_by_index":{"_audit":24,"_internal":45,"_telemetry":4},"buckets_rf_by_origin_site":{"default":73},"buckets_sf_by_origin_site":{"default":73},"delayed_buckets_to_discard":[],"eai:acl":null,"fixup_set":[],"heartbeat_started":true,"host_port_pair":"10.36.0.6:8089","indexing_disk_space":210707374080,"is_searchable":true,"is_valid_bundle":true,"label":"splunk-stack1-indexer-0","last_dry_run_bundle":"","last_heartbeat":1584493732,"last_validated_bundle":"14310A4AABD23E85BBD4559C4A3B59F8","latest_bundle_id":"14310A4AABD23E85BBD4559C4A3B59F8","peer_registered_summaries":true,"pending_builds":[],"pending_job_count":0,"primary_count":73,"primary_count_remote":0,"register_search_address":"10.36.0.6:8089","replication_count":0,"replication_port":9887,"replication_use_ssl":false,"restart_required_for_applying_dry_ru
n_bundle":false,"search_state_counter":{"PendingSearchable":0,"Searchable":73,"SearchablePendingMask":0,"Unsearchable":0},"site":"default","splunk_version":"8.0.2","status":"Up","status_counter":{"Complete":69,"NonStreamingTarget":0,"StreamingSource":4,"StreamingTarget":0},"summary_replication_count":0}}],"paging":{"total":1,"perPage":30,"offset":0},"messages":[]}`,
},
}
wantCalls = map[string][]mockFuncCall{"Get": funcCalls}
pod := &corev1.Pod{
Expand All @@ -151,14 +161,24 @@ func TestIndexerClusterPodManager(t *testing.T) {
"controller-revision-hash": "v1",
},
},
Status: corev1.PodStatus{
Phase: corev1.PodRunning,
ContainerStatuses: []corev1.ContainerStatus{
{Ready: true},
},
},
}
method := "IndexerClusterPodManager.Update(All pods ready)"
indexerClusterPodManagerTester(t, method, mockHandlers, 1, enterprisev1.PhaseReady, statefulSet, wantCalls, nil, statefulSet, pod)

// test pod needs update => decommission
mockHandlers = append(mockHandlers,
spltest.MockHTTPHandler{"POST", "https://splunk-stack1-indexer-0.splunk-stack1-indexer-headless.test.svc.cluster.local:8089/services/cluster/slave/control/control/decommission?enforce_counts=0", 200, nil, ``},
)
mockHandlers = append(mockHandlers, spltest.MockHTTPHandler{
Method: "POST",
URL: "https://splunk-stack1-indexer-0.splunk-stack1-indexer-headless.test.svc.cluster.local:8089/services/cluster/slave/control/control/decommission?enforce_counts=0",
Status: 200,
Err: nil,
Body: ``,
})
pod.ObjectMeta.Labels["controller-revision-hash"] = "v0"
method = "IndexerClusterPodManager.Update(Decommission Pod)"
indexerClusterPodManagerTester(t, method, mockHandlers, 1, enterprisev1.PhaseUpdating, statefulSet, wantCalls, nil, statefulSet, pod)
Expand Down Expand Up @@ -192,9 +212,13 @@ func TestIndexerClusterPodManager(t *testing.T) {

// test scale down => decommission pod
mockHandlers[1].Body = `{"entry":[{"name":"aa45bf46-7f46-47af-a760-590d5c606d10","content":{"status":"Up","label":"splunk-stack1-indexer-0"}},{"name":"D39B1729-E2C5-4273-B9B2-534DA7C2F866","content":{"status":"GracefulShutdown","label":"splunk-stack1-indexer-1"}}]}`
mockHandlers = append(mockHandlers,
spltest.MockHTTPHandler{"POST", "https://splunk-stack1-cluster-master-service.test.svc.cluster.local:8089/services/cluster/master/control/control/remove_peers?peers=D39B1729-E2C5-4273-B9B2-534DA7C2F866", 200, nil, ``},
)
mockHandlers = append(mockHandlers, spltest.MockHTTPHandler{
Method: "POST",
URL: "https://splunk-stack1-cluster-master-service.test.svc.cluster.local:8089/services/cluster/master/control/control/remove_peers?peers=D39B1729-E2C5-4273-B9B2-534DA7C2F866",
Status: 200,
Err: nil,
Body: ``,
})
pvcCalls := []mockFuncCall{
{metaName: "*v1.PersistentVolumeClaim-test-pvc-etc-splunk-stack1-1"},
{metaName: "*v1.PersistentVolumeClaim-test-pvc-var-splunk-stack1-1"},
Expand Down
60 changes: 41 additions & 19 deletions pkg/splunk/reconcile/searchheadcluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,14 @@ func ApplySearchHeadCluster(client ControllerClient, cr *enterprisev1.SearchHead
cr.Status.DeployerPhase = enterprisev1.PhaseError
cr.Status.Replicas = cr.Spec.Replicas
cr.Status.Selector = fmt.Sprintf("app.kubernetes.io/instance=splunk-%s-search-head", cr.GetIdentifier())
if cr.Status.Members == nil {
cr.Status.Members = []enterprisev1.SearchHeadClusterMemberStatus{}
}
defer func() {
client.Status().Update(context.TODO(), cr)
err = client.Status().Update(context.TODO(), cr)
if err != nil {
scopedLog.Error(err, "Status update failed")
}
}()

// check if deletion has been requested
Expand Down Expand Up @@ -189,6 +195,10 @@ func (mgr *SearchHeadClusterPodManager) PrepareRecycle(n int32) (bool, error) {
mgr.log.Info("Waiting for active searches to complete", "memberName", memberName)
}
return searchesComplete, nil

case "": // this can happen after the member has already been recycled and we're just waiting for state to update
mgr.log.Info("Member has empty Status", "memberName", memberName)
return false, nil
}

// unhandled status
Expand Down Expand Up @@ -226,13 +236,14 @@ func (mgr *SearchHeadClusterPodManager) getClient(n int32) *splclient.SplunkClie
// updateStatus for SearchHeadClusterPodManager uses the REST API to update the status for a SearchHeadCluster custom resource
func (mgr *SearchHeadClusterPodManager) updateStatus(statefulSet *appsv1.StatefulSet) error {
// populate members status using REST API to get search head cluster member info
mgr.cr.Status.Captain = ""
mgr.cr.Status.CaptainReady = false
mgr.cr.Status.ReadyReplicas = statefulSet.Status.ReadyReplicas
if mgr.cr.Status.ReadyReplicas == 0 {
return nil
}
gotCaptainInfo := false
mgr.cr.Status.Members = []enterprisev1.SearchHeadClusterMemberStatus{}
for n := int32(0); n < mgr.cr.Status.ReadyReplicas; n++ {
for n := int32(0); n < statefulSet.Status.Replicas; n++ {
c := mgr.getClient(n)
memberName := enterprise.GetSplunkStatefulsetPodName(enterprise.SplunkSearchHead, mgr.cr.GetIdentifier(), n)
memberStatus := enterprisev1.SearchHeadClusterMemberStatus{Name: memberName}
Expand All @@ -243,24 +254,35 @@ func (mgr *SearchHeadClusterPodManager) updateStatus(statefulSet *appsv1.Statefu
memberStatus.Registered = memberInfo.Registered
memberStatus.ActiveHistoricalSearchCount = memberInfo.ActiveHistoricalSearchCount
memberStatus.ActiveRealtimeSearchCount = memberInfo.ActiveRealtimeSearchCount
if !gotCaptainInfo {
// try querying captain api; note that this should work on any node
captainInfo, err := c.GetSearchHeadCaptainInfo()
if err == nil {
mgr.cr.Status.Captain = captainInfo.Label
mgr.cr.Status.CaptainReady = captainInfo.ServiceReady
mgr.cr.Status.Initialized = captainInfo.Initialized
mgr.cr.Status.MinPeersJoined = captainInfo.MinPeersJoined
mgr.cr.Status.MaintenanceMode = captainInfo.MaintenanceMode
gotCaptainInfo = true
}
}
} else if n < statefulSet.Status.Replicas {
// ignore error if pod was just terminated for scale down event (n >= Replicas)
} else {
mgr.log.Error(err, "Unable to retrieve search head cluster member info", "memberName", memberName)
return err
}
mgr.cr.Status.Members = append(mgr.cr.Status.Members, memberStatus)

if err == nil && !gotCaptainInfo {
// try querying captain api; note that this should work on any node
captainInfo, err := c.GetSearchHeadCaptainInfo()
if err == nil {
mgr.cr.Status.Captain = captainInfo.Label
mgr.cr.Status.CaptainReady = captainInfo.ServiceReady
mgr.cr.Status.Initialized = captainInfo.Initialized
mgr.cr.Status.MinPeersJoined = captainInfo.MinPeersJoined
mgr.cr.Status.MaintenanceMode = captainInfo.MaintenanceMode
gotCaptainInfo = true
} else {
mgr.log.Error(err, "Unable to retrieve captain info", "memberName", memberName)
}
}

if n < int32(len(mgr.cr.Status.Members)) {
mgr.cr.Status.Members[n] = memberStatus
} else {
mgr.cr.Status.Members = append(mgr.cr.Status.Members, memberStatus)
}
}

// truncate any extra members that we didn't check (leftover from scale down)
if statefulSet.Status.Replicas < int32(len(mgr.cr.Status.Members)) {
mgr.cr.Status.Members = mgr.cr.Status.Members[:statefulSet.Status.Replicas]
}

return nil
Expand Down
Loading

0 comments on commit 78b6874

Please sign in to comment.