Skip to content

Commit

Permalink
feat: improve api-server and controller performance (#3222)
Browse files Browse the repository at this point in the history
* group read comparison settings during app reconciliation
* Reduce lock contention in clusterInfo::ensureSynced(). Add getRepoObj stats
* Remove additional source of lock contention
* Exclude the coordination.k8s.io/Lease resource

Co-authored-by: Alexander Matyushentsev <amatyushentsev@gmail.com>
  • Loading branch information
jessesuen and alexmt authored Mar 16, 2020
1 parent 487d664 commit 476b09c
Show file tree
Hide file tree
Showing 19 changed files with 335 additions and 222 deletions.
2 changes: 2 additions & 0 deletions Gopkg.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ test:
.PHONY: test-e2e
test-e2e:
# NO_PROXY ensures all tests don't go out through a proxy if one is configured on the test system
NO_PROXY=* ./hack/test.sh -timeout 15m ./test/e2e
NO_PROXY=* ./hack/test.sh -timeout 15m -v ./test/e2e

.PHONY: start-e2e
start-e2e: cli
Expand Down
2 changes: 1 addition & 1 deletion cmd/argocd-application-controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"os"
"time"

"github.com/argoproj/pkg/stats"
log "github.com/sirupsen/logrus"
"github.com/spf13/cobra"
"k8s.io/client-go/kubernetes"
Expand All @@ -25,7 +26,6 @@ import (
"github.com/argoproj/argo-cd/util/cli"
"github.com/argoproj/argo-cd/util/kube"
"github.com/argoproj/argo-cd/util/settings"
"github.com/argoproj/argo-cd/util/stats"
)

const (
Expand Down
2 changes: 1 addition & 1 deletion cmd/argocd-repo-server/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"os"
"time"

"github.com/argoproj/pkg/stats"
log "github.com/sirupsen/logrus"
"github.com/spf13/cobra"

Expand All @@ -16,7 +17,6 @@ import (
reposervercache "github.com/argoproj/argo-cd/reposerver/cache"
"github.com/argoproj/argo-cd/reposerver/metrics"
"github.com/argoproj/argo-cd/util/cli"
"github.com/argoproj/argo-cd/util/stats"
"github.com/argoproj/argo-cd/util/tls"
)

Expand Down
2 changes: 1 addition & 1 deletion cmd/argocd-server/commands/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"time"

"github.com/argoproj/pkg/stats"
"github.com/spf13/cobra"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd"
Expand All @@ -15,7 +16,6 @@ import (
"github.com/argoproj/argo-cd/server"
servercache "github.com/argoproj/argo-cd/server/cache"
"github.com/argoproj/argo-cd/util/cli"
"github.com/argoproj/argo-cd/util/stats"
"github.com/argoproj/argo-cd/util/tls"
)

Expand Down
17 changes: 9 additions & 8 deletions controller/appcontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -821,22 +821,20 @@ func (ctrl *ApplicationController) processAppRefreshQueueItem() (processNext boo
return
}

app := origApp.DeepCopy()
logCtx := log.WithFields(log.Fields{"application": app.Name})
startTime := time.Now()
defer func() {
reconcileDuration := time.Since(startTime)
ctrl.metricsServer.IncReconcile(origApp, reconcileDuration)
logCtx := log.WithFields(log.Fields{
"application": origApp.Name,
"time_ms": reconcileDuration.Seconds() * 1e3,
logCtx.WithFields(log.Fields{
"time_ms": reconcileDuration.Milliseconds(),
"level": comparisonLevel,
"dest-server": origApp.Spec.Destination.Server,
"dest-namespace": origApp.Spec.Destination.Namespace,
})
logCtx.Info("Reconciliation completed")
}).Info("Reconciliation completed")
}()

app := origApp.DeepCopy()
logCtx := log.WithFields(log.Fields{"application": app.Name})
if comparisonLevel == ComparisonWithNothing {
managedResources := make([]*appv1.ResourceDiff, 0)
if err := ctrl.cache.GetAppManagedResources(app.Name, &managedResources); err != nil {
Expand Down Expand Up @@ -888,6 +886,9 @@ func (ctrl *ApplicationController) processAppRefreshQueueItem() (processNext boo

observedAt := metav1.Now()
compareResult := ctrl.appStateManager.CompareAppState(app, project, revision, app.Spec.Source, refreshType == appv1.RefreshTypeHard, localManifests)
for k, v := range compareResult.timings {
logCtx = logCtx.WithField(k, v.Milliseconds())
}

ctrl.normalizeApplication(origApp, app)

Expand All @@ -912,7 +913,7 @@ func (ctrl *ApplicationController) processAppRefreshQueueItem() (processNext boo
)
}
} else {
logCtx.Infof("Sync prevented by sync window")
logCtx.Info("Sync prevented by sync window")
}

if app.Status.ReconciledAt == nil || comparisonLevel == CompareWithLatest {
Expand Down
48 changes: 27 additions & 21 deletions controller/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ func NewLiveStateCache(
appInformer: appInformer,
db: db,
clusters: make(map[string]*clusterInfo),
lock: &sync.Mutex{},
lock: &sync.RWMutex{},
onObjectUpdated: onObjectUpdated,
kubectl: kubectl,
settingsMgr: settingsMgr,
Expand All @@ -82,7 +82,7 @@ func NewLiveStateCache(
type liveStateCache struct {
db db.ArgoDB
clusters map[string]*clusterInfo
lock *sync.Mutex
lock *sync.RWMutex
appInformer cache.SharedIndexInformer
onObjectUpdated ObjectUpdatedHandler
kubectl kube.Kubectl
Expand All @@ -109,31 +109,35 @@ func (c *liveStateCache) loadCacheSettings() (*cacheSettings, error) {
}

func (c *liveStateCache) getCluster(server string) (*clusterInfo, error) {
c.lock.Lock()
defer c.lock.Unlock()
c.lock.RLock()
info, ok := c.clusters[server]
c.lock.RUnlock()
if !ok {
logCtx := log.WithField("server", server)
logCtx.Info("initializing cluster")
cluster, err := c.db.GetCluster(context.Background(), server)
if err != nil {
return nil, err
}
info = &clusterInfo{
apisMeta: make(map[schema.GroupKind]*apiMeta),
lock: &sync.Mutex{},
lock: &sync.RWMutex{},
nodes: make(map[kube.ResourceKey]*node),
nsIndex: make(map[string]map[kube.ResourceKey]*node),
onObjectUpdated: c.onObjectUpdated,
kubectl: c.kubectl,
cluster: cluster,
syncTime: nil,
log: log.WithField("server", cluster.Server),
log: logCtx,
cacheSettingsSrc: c.getCacheSettings,
onEventReceived: func(event watch.EventType, un *unstructured.Unstructured) {
c.metricsServer.IncClusterEventsCount(cluster.Server)
gvk := un.GroupVersionKind()
c.metricsServer.IncClusterEventsCount(cluster.Server, gvk.Group, gvk.Kind)
},
}

c.lock.Lock()
c.clusters[cluster.Server] = info
c.lock.Unlock()
}
return info, nil
}
Expand All @@ -152,8 +156,8 @@ func (c *liveStateCache) getSyncedCluster(server string) (*clusterInfo, error) {

func (c *liveStateCache) Invalidate() {
log.Info("invalidating live state cache")
c.lock.Lock()
defer c.lock.Unlock()
c.lock.RLock()
defer c.lock.RLock()
for _, clust := range c.clusters {
clust.invalidate()
}
Expand Down Expand Up @@ -210,8 +214,6 @@ func isClusterHasApps(apps []interface{}, cluster *appv1.Cluster) bool {
}

func (c *liveStateCache) getCacheSettings() *cacheSettings {
c.cacheSettingsLock.Lock()
defer c.cacheSettingsLock.Unlock()
return c.cacheSettings
}

Expand Down Expand Up @@ -261,20 +263,24 @@ func (c *liveStateCache) Run(ctx context.Context) error {
util.RetryUntilSucceed(func() error {
clusterEventCallback := func(event *db.ClusterEvent) {
c.lock.Lock()
defer c.lock.Unlock()
if cluster, ok := c.clusters[event.Cluster.Server]; ok {
cluster, ok := c.clusters[event.Cluster.Server]
if ok {
defer c.lock.Unlock()
if event.Type == watch.Deleted {
cluster.invalidate()
delete(c.clusters, event.Cluster.Server)
} else if event.Type == watch.Modified {
cluster.cluster = event.Cluster
cluster.invalidate()
}
} else if event.Type == watch.Added && isClusterHasApps(c.appInformer.GetStore().List(), event.Cluster) {
go func() {
// warm up cache for cluster with apps
_, _ = c.getSyncedCluster(event.Cluster.Server)
}()
} else {
c.lock.Unlock()
if event.Type == watch.Added && isClusterHasApps(c.appInformer.GetStore().List(), event.Cluster) {
go func() {
// warm up cache for cluster with apps
_, _ = c.getSyncedCluster(event.Cluster.Server)
}()
}
}
}

Expand All @@ -287,8 +293,8 @@ func (c *liveStateCache) Run(ctx context.Context) error {
}

func (c *liveStateCache) GetClustersInfo() []metrics.ClusterInfo {
c.lock.Lock()
defer c.lock.Unlock()
c.lock.RLock()
defer c.lock.RUnlock()
res := make([]metrics.ClusterInfo, 0)
for _, info := range c.clusters {
res = append(res, info.getClusterInfo())
Expand Down
4 changes: 2 additions & 2 deletions controller/cache/cache_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ import (
func TestGetServerVersion(t *testing.T) {
now := time.Now()
cache := &liveStateCache{
lock: &sync.Mutex{},
lock: &sync.RWMutex{},
clusters: map[string]*clusterInfo{
"http://localhost": {
syncTime: &now,
lock: &sync.Mutex{},
lock: &sync.RWMutex{},
serverVersion: "123",
},
}}
Expand Down
Loading

0 comments on commit 476b09c

Please sign in to comment.